From 4d7b699215718d845c346bb4e27e8f5ebbc78f2d Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 24 Jun 2022 14:37:31 +0200 Subject: [PATCH 001/140] Initial commit: copied everything into a single file --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 6149 +++++++++++++++++++++++ 1 file changed, 6149 insertions(+) create mode 100644 cpp/include/raft/spatial/knn/ivf_pq.cuh diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh new file mode 100644 index 0000000000..d7ead0fa6a --- /dev/null +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -0,0 +1,6149 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include +#include + +/////////////////// +#include +#include +#include +#include +#include + +////////////////// + +#define CUANN_DEBUG + +namespace raft::spatial::knn::ivf_pq { + +/** + * + * + * + * + * + * fp_8bit + */ + +template +struct fp_8bit; + +template +__device__ __host__ fp_8bit __float2fp_8bit(const float v); +template +__device__ __host__ float __fp_8bit2float(const fp_8bit& v); + +template +struct fp_8bit { + uint8_t bitstring; + + __device__ __host__ fp_8bit(const uint8_t bs) { bitstring = bs; } + __device__ __host__ fp_8bit(const float fp) + { + bitstring = __float2fp_8bit(fp).bitstring; + } + __device__ __host__ fp_8bit& operator=(const float fp) + { + bitstring = __float2fp_8bit(fp).bitstring; + return *this; + } + + __device__ __host__ operator float() const { return __fp_8bit2float(*this); } +}; + +// Since __float_as_uint etc can not be used in host codes, +// these converters are needed for test. +union cvt_fp_32bit { + float fp; + uint32_t bs; +}; +union cvt_fp_16bit { + half fp; + uint16_t bs; +}; + +// Type converters +template +__device__ __host__ fp_8bit __float2fp_8bit(const float v) +{ + if (v < 1. 
/ (1u << ((1u << (expBitLen - 1)) - 1))) + return fp_8bit{static_cast(0)}; + return fp_8bit{static_cast( + (cvt_fp_32bit{.fp = v}.bs + (((1u << (expBitLen - 1)) - 1) << 23) - 0x3f800000u) >> + (15 + expBitLen))}; +} + +template +__device__ __host__ float __fp_8bit2float(const fp_8bit& v) +{ + return cvt_fp_32bit{.bs = ((v.bitstring << (15 + expBitLen)) + + (0x3f800000u | (0x00400000u >> (8 - expBitLen))) - + (((1u << (expBitLen - 1)) - 1) << 23))} + .fp; +} + +/** + * + * end of fp8bit + * + */ + +using namespace cub; + +// +extern __shared__ float smemArray[]; + +#define FP16_MAX 65504.0 + +/* CUANN status type */ +typedef enum { + CUANN_STATUS_SUCCESS = 0, + CUANN_STATUS_ALLOC_FAILED = 1, + CUANN_STATUS_NOT_INITIALIZED = 2, + CUANN_STATUS_INVALID_VALUE = 3, + CUANN_STATUS_INTERNAL_ERROR = 4, + CUANN_STATUS_FILEIO_ERROR = 5, + CUANN_STATUS_CUDA_ERROR = 6, + CUANN_STATUS_CUBLAS_ERROR = 7, + CUANN_STATUS_INVALID_POINTER = 8, + CUANN_STATUS_VERSION_ERROR = 9, + CUANN_STATUS_UNSUPPORTED_DTYPE = 10, +} cuannStatus_t; + +/* CUANN similarity type */ +typedef enum { + CUANN_SIMILARITY_INNER = 0, + CUANN_SIMILARITY_L2 = 1, +} cuannSimilarity_t; + +/* CUANN PQ center type */ +typedef enum { + CUANN_PQ_CENTER_PER_SUBSPACE = 0, + CUANN_PQ_CENTER_PER_CLUSTER = 1, +} cuannPqCenter_t; + +/* Context */ +struct cuannContext { + int devId; + cudaStream_t stream; + cudaDeviceProp deviceProp; + cublasHandle_t cublasHandle; + + int numDevices; + cudaStream_t* streams; + cudaDeviceProp* deviceProps; + cublasHandle_t* cublasHandles; +}; +typedef struct cuannContext* cuannHandle_t; + +/* IvfPq */ +struct cuannIvfPqDescriptor { + uint32_t numClusters; + uint32_t numDataset; + uint32_t dimDataset; + uint32_t dimDatasetExt; + uint32_t dimRotDataset; + uint32_t dimPq; + uint32_t bitPq; + cuannSimilarity_t similarity; + cuannPqCenter_t typePqCenter; + cudaDataType_t dtypeDataset; + cudaDataType_t internalDistanceDtype; + cudaDataType_t smemLutDtype; + uint32_t indexVersion; + uint32_t maxClusterSize; + uint32_t lenPq; // dimRotDataset / dimPq + uint32_t numProbes; + uint32_t topK; + uint32_t maxQueries; + uint32_t maxBatchSize; + uint32_t maxSamples; + uint32_t* inclusiveSumSortedClusterSize; // [numClusters,] + float* sqsumClusters; // [numClusters,] + size_t sizeCubWorkspace; + uint32_t _numClustersSize0; // (*) urgent WA, need to be fixed + uint32_t preferredThreadBlockSize; +}; +typedef struct cuannIvfPqDescriptor* cuannIvfPqDescriptor_t; + +// header of index +struct cuannIvfPqIndexHeader { + // (*) DO NOT CHANGE ORDER + size_t indexSize; + uint32_t version; + uint32_t numClusters; + uint32_t numDataset; + uint32_t dimDataset; + uint32_t dimPq; + uint32_t similarity; + uint32_t maxClusterSize; + uint32_t dimRotDataset; + uint32_t bitPq; + uint32_t typePqCenter; + uint32_t dtypeDataset; + uint32_t dimDatasetExt; + uint32_t numDatasetAdded; + uint32_t _dummy[256 - 15]; +}; + +// +char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) +{ + if (dtype == CUDA_R_32F) + sprintf(string, "float (CUDA_R_32F)"); + else if (dtype == CUDA_R_16F) + sprintf(string, "half (CUDA_R_16F)"); + else if (dtype == CUDA_R_8U) + sprintf(string, "uint8 (CUDA_R_8U)"); + else if (dtype == CUDA_R_8I) + sprintf(string, "int8 (CUDA_R_8I)"); + else + sprintf(string, "unknown"); + return string; +} + +// +size_t _cuann_aligned(size_t size, size_t unit = 128) +{ + if (size % unit) { size += unit - (size % unit); } + return size; +} + +// memset +void _cuann_memset(void* ptr, int value, size_t count) +{ + cudaPointerAttributes attr; + 
cudaPointerGetAttributes(&attr, ptr); + if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { + cudaError_t ret = cudaMemset(ptr, value, count); + if (ret != cudaSuccess) { + fprintf(stderr, "(%s) cudaMemset() failed\n", __func__); + exit(-1); + } + } else { + memset(ptr, value, count); + } +} + +// square sum along column +__global__ void kern_sqsum(uint32_t nRows, + uint32_t nCols, + const float* a, // [nRows, nCols] + float* out // [nRows] +) +{ + uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x); + if (iRow >= nRows) return; + + float sqsum = 0.0; + for (uint64_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { + float val = a[iCol + (nCols * iRow)]; + sqsum += val * val; + } + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); + if (threadIdx.x == 0) { out[iRow] = sqsum; } +} + +// square sum along column +void _cuann_sqsum(uint32_t nRows, + uint32_t nCols, + const float* a, // [numDataset, dimDataset] + float* out // [numDataset,] +) +{ + dim3 threads(32, 4, 1); // DO NOT CHANGE + dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1); + kern_sqsum<<>>(nRows, nCols, a, out); +} + +// outer add +__global__ void kern_outer_add(const float* a, + uint32_t numA, + const float* b, + uint32_t numB, + float* c // [numA, numB] +) +{ + uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint64_t iA = gid / numB; + uint64_t iB = gid % numB; + if (iA >= numA) return; + float valA = (a == NULL) ? 0.0 : a[iA]; + float valB = (b == NULL) ? 0.0 : b[iB]; + c[gid] = valA + valB; +} + +// outer add +void _cuann_outer_add(const float* a, + uint32_t numA, + const float* b, + uint32_t numB, + float* c // [numA, numB] +) +{ + dim3 threads(128, 1, 1); + dim3 blocks(((uint64_t)numA * numB + threads.x - 1) / threads.x, 1, 1); + kern_outer_add<<>>(a, numA, b, numB, c); +} + +// argmin along column +__global__ void kern_argmin(uint32_t nRows, + uint32_t nCols, + const float* a, // [nRows, nCols] + uint32_t* out // [nRows] +) +{ + __shared__ uint32_t smCol[1024]; + __shared__ float smVal[1024]; + uint32_t iRow = blockIdx.x; + if (iRow >= nRows) return; + uint32_t iCol = threadIdx.x; + uint32_t minCol = nCols; + float minVal = FLT_MAX; + for (iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { + if (minVal > a[iCol + (nCols * iRow)]) { + minVal = a[iCol + (nCols * iRow)]; + minCol = iCol; + } + } + smVal[threadIdx.x] = minVal; + smCol[threadIdx.x] = minCol; + __syncthreads(); + for (uint32_t offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset) { + if (smVal[threadIdx.x] < smVal[threadIdx.x + offset]) { + } else if (smVal[threadIdx.x] > smVal[threadIdx.x + offset]) { + smVal[threadIdx.x] = smVal[threadIdx.x + offset]; + smCol[threadIdx.x] = smCol[threadIdx.x + offset]; + } else if (smCol[threadIdx.x] > smCol[threadIdx.x + offset]) { + smCol[threadIdx.x] = smCol[threadIdx.x + offset]; + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { out[iRow] = smCol[0]; } +} + +// argmin along column +void _cuann_argmin(uint32_t nRows, + uint32_t nCols, + const float* a, // [nRows, nCols] + uint32_t* out // [nRows] +) +{ + uint32_t nThreads = 1024; + while (nThreads > nCols) { + nThreads /= 2; + } + nThreads = max(nThreads, 128); + kern_argmin<<>>(nRows, nCols, a, out); +} + +// copy +template +__global__ void kern_copy(uint32_t nRows, + uint32_t nCols, 
+ const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D divisor) +{ + uint32_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint32_t iCol = gid % nCols; + uint32_t iRow = gid / nCols; + if (iRow >= nRows) return; + dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor; +} + +// copy +template +void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D divisor) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; + kern_copy<<>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor); +} + +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const float* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const uint32_t* src, + uint32_t ldSrc, + uint8_t* dst, + uint32_t ldDst, + uint8_t divisor); +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const uint8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const int8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); + +// copy_CPU +template +void _cuann_copy_CPU(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst) +{ + for (uint32_t ir = 0; ir < nRows; ir++) { + for (uint32_t ic = 0; ic < nCols; ic++) { + dst[ic + (ldDst * ir)] = src[ic + (ldSrc * ir)]; + } + } +} + +template void _cuann_copy_CPU( + uint32_t nRows, uint32_t nCols, const float* src, uint32_t ldSrc, float* dst, uint32_t ldDst); + +// copy_fill +template +__global__ void kern_copy_fill(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D fillValue, + D divisor) +{ + uint32_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint32_t iCol = gid % ldDst; + uint32_t iRow = gid / ldDst; + if (iRow >= nRows) return; + if (iCol < nCols) { + dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor; + } else { + dst[iCol + (ldDst * iRow)] = fillValue; + } +} + +// copy_fill +template +void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D fillValue, + D divisor, + cudaStream_t stream) +{ + assert(ldSrc >= nCols); + assert(ldDst >= nCols); + uint32_t nThreads = 128; + uint32_t nBlocks = ((nRows * ldDst) + nThreads - 1) / nThreads; + kern_copy_fill + <<>>(nRows, nCols, src, ldSrc, dst, ldDst, fillValue, divisor); +} + +template void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const float* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float fillValue, + float divisor, + cudaStream_t stream); +template void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const uint8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float fillValue, + float divisor, + cudaStream_t stream); +template void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const int8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float fillValue, + float divisor, + cudaStream_t stream); + +// copy with row list +template +__global__ void kern_copy_with_list(uint32_t nRows, + uint32_t nCols, + const T* src, // [..., ldSrc] + const uint32_t* rowList, // [nRows,] + uint32_t ldSrc, + float* dst, // [nRows, 
ldDst] + uint32_t ldDst, + float divisor) +{ + uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint64_t iCol = gid % nCols; + uint64_t iRow = gid / nCols; + if (iRow >= nRows) return; + uint64_t iaRow = rowList[iRow]; + dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor; +} + +// copy with row list +template +void _cuann_copy_with_list(uint32_t nRows, + uint32_t nCols, + const T* src, // [..., ldSrc] + const uint32_t* rowList, // [nRows,] + uint32_t ldSrc, + float* dst, // [nRows, ldDst] + uint32_t ldDst, + float divisor = 1.0f) +{ + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, src); + if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { + for (uint64_t iRow = 0; iRow < nRows; iRow++) { + uint64_t iaRow = rowList[iRow]; + for (uint64_t iCol = 0; iCol < nCols; iCol++) { + dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor; + } + } + } else { + uint32_t nThreads = 128; + uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; + kern_copy_with_list + <<>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst, divisor); + } +} + +template void _cuann_copy_with_list(uint32_t nRows, + uint32_t nCols, + const float* src, + const uint32_t* rowList, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); +template void _cuann_copy_with_list(uint32_t nRows, + uint32_t nCols, + const uint8_t* src, + const uint32_t* rowList, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); +template void _cuann_copy_with_list(uint32_t nRows, + uint32_t nCols, + const int8_t* src, + const uint32_t* rowList, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); + +// a -= b +__global__ void kern_a_me_b(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + uint32_t ldA, + float* b // [nCols] +) +{ + uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint64_t iCol = gid % nCols; + uint64_t iRow = gid / nCols; + if (iRow >= nRows) return; + a[iCol + (ldA * iRow)] -= b[iCol]; +} + +// a -= b +void _cuann_a_me_b(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + uint32_t ldA, + float* b // [nCols] +) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; + kern_a_me_b<<>>(nRows, nCols, a, ldA, b); +} + +// accumulate +template +__global__ void kern_accumulate_with_label(uint32_t nRowsOutput, + uint32_t nCols, + float* output, // [nRowsOutput, nCols,] + uint32_t* count, // [nRowsOutput,] + uint32_t nRowsInput, + const T* input, // [nRowsInput, nCols,] + const uint32_t* label, // [nRowsInput,] + float divisor) +{ + uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint64_t iCol = gid % nCols; + uint64_t iRowInput = gid / nCols; + if (iRowInput >= nRowsInput) return; + uint64_t iRowOutput = label[iRowInput]; + if (iCol == 0) { atomicAdd(&(count[iRowOutput]), 1); } + atomicAdd(&(output[iCol + (nCols * iRowOutput)]), input[gid] / divisor); +} + +// accumulate +template +void _cuann_accumulate_with_label(uint32_t nRowsOutput, + uint32_t nCols, + float* output, // [nRowsOutput, nCols,] + uint32_t* count, // [nRowsOutput,] + uint32_t nRowsInput, + const T* input, // [nRowsInput, nCols,] + const uint32_t* label, // [nRowsInput,] + float divisor = 1.0) +{ + bool useGPU = 1; + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, output); + if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } + cudaPointerGetAttributes(&attr, count); + if (attr.type == 
cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } + cudaPointerGetAttributes(&attr, input); + if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } + // _cuann_memset(output, 0, sizeof(float) * nRowsOutput * nCols); + // _cuann_memset(count, 0, sizeof(uint32_t) * nRowsOutput); + + if (useGPU) { + // GPU + uint32_t nThreads = 128; + uint64_t nBlocks = (((uint64_t)nRowsInput * nCols) + nThreads - 1) / nThreads; + kern_accumulate_with_label + <<>>(nRowsOutput, nCols, output, count, nRowsInput, input, label, divisor); + } else { + // CPU + cudaDeviceSynchronize(); + for (uint64_t i = 0; i < nRowsInput; i++) { + uint64_t l = label[i]; + count[l] += 1; + for (uint64_t j = 0; j < nCols; j++) { + output[j + (nCols * l)] += input[j + (nCols * i)] / divisor; + } + } + } +} + +// normalize +__global__ void kern_normalize(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples // [nRows,] +) +{ + uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x); + if (iRow >= nRows) return; + if (numSamples != NULL and numSamples[iRow] < 1) return; + + float sqsum = 0.0; + for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { + float val = a[iCol + (nCols * iRow)]; + sqsum += val * val; + } + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); + sqsum = sqrt(sqsum); + for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { + a[iCol + (nCols * iRow)] /= sqsum; + } +} + +// normalize +void _cuann_normalize(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples = nullptr // [nRows,] +) +{ + dim3 threads(32, 4, 1); // DO NOT CHANGE + dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1); + kern_normalize<<>>(nRows, nCols, a, numSamples); +} + +// divide +__global__ void kern_divide(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples // [nRows,] +) +{ + uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint64_t iRow = gid / nCols; + if (iRow >= nRows) return; + if (numSamples[iRow] == 0) return; + a[gid] /= numSamples[iRow]; +} + +// divide +void _cuann_divide(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples // [nRows,] +) +{ + dim3 threads(128, 1, 1); + dim3 blocks(((uint64_t)nRows * nCols + threads.x - 1) / threads.x, 1, 1); + kern_divide<<>>(nRows, nCols, a, numSamples); +} + +// +template +__global__ void kern_transpose_copy_3d(uint32_t num0, + uint32_t num1, + uint32_t num2, + D* dst, // [num2, ld1, ld0] + uint32_t ld0, + uint32_t ld1, + const S* src, // [...] + uint32_t stride0, + uint32_t stride1, + uint32_t stride2) +{ + uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + if (tid >= num0 * num1 * num2) return; + uint32_t i0 = tid % num0; + uint32_t i1 = (tid / num0) % num1; + uint32_t i2 = (tid / num0) / num1; + + dst[i0 + (ld0 * i1) + (ld0 * ld1 * i2)] = src[(stride0 * i0) + (stride1 * i1) + (stride2 * i2)]; +} + +// transpose_copy_3d +template +void _cuann_transpose_copy_3d(uint32_t num0, + uint32_t num1, + uint32_t num2, + D* dst, // [num2, ld1, ld0] + uint32_t ld0, + uint32_t ld1, + const S* src, // [...] 
+ uint32_t stride0, + uint32_t stride1, + uint32_t stride2) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = ((num0 * num1 * num2) + nThreads - 1) / nThreads; + kern_transpose_copy_3d + <<>>(num0, num1, num2, dst, ld0, ld1, src, stride0, stride1, stride2); +} + +template void _cuann_transpose_copy_3d(uint32_t num0, + uint32_t num1, + uint32_t num2, + float* dst, + uint32_t ld0, + uint32_t ld1, + const float* src, + uint32_t stride0, + uint32_t stride1, + uint32_t stride2); + +// +template +__global__ void kern_axpy(int num, T alpha, const T* x, T* y) +{ + uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + if (tid >= num) return; + y[tid] += alpha * x[tid]; +} + +// +template +void _cuann_axpy(int num, T alpha, const T* x, T* y) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = (num + nThreads - 1) / nThreads; + kern_axpy<<>>(num, alpha, x, y); +} + +template void _cuann_axpy(int num, float alpha, const float* x, float* y); +template void _cuann_axpy(int num, uint32_t alpha, const uint32_t* x, uint32_t* y); + +// +template +T** _cuann_multi_device_malloc(int numDevices, + size_t numArrayElements, + const char* arrayName, + bool useCudaMalloc = false // If true, cudaMalloc() used, + // otherwise, cudaMallocManaged() used. +) +{ + cudaError_t cudaError; + int orgDevId; + cudaError = cudaGetDevice(&orgDevId); + if (cudaError != cudaSuccess) { + fprintf( + stderr, "(%s, %d) cudaGetDevice() failed (arrayName: %s).\n", __func__, __LINE__, arrayName); + exit(-1); + } + T** arrays = (T**)malloc(sizeof(T*) * numDevices); + for (int devId = 0; devId < numDevices; devId++) { + cudaError = cudaSetDevice(devId); + if (cudaError != cudaSuccess) { + fprintf(stderr, + "(%s, %d) cudaSetDevice() failed (arrayName: %s).\n", + __func__, + __LINE__, + arrayName); + exit(-1); + } + if (useCudaMalloc) { + cudaError = cudaMalloc(&(arrays[devId]), sizeof(T) * numArrayElements); + if (cudaError != cudaSuccess) { + fprintf( + stderr, "(%s, %d) cudaMalloc() failed (arrayName: %s).\n", __func__, __LINE__, arrayName); + exit(-1); + } + } else { + cudaError = cudaMallocManaged(&(arrays[devId]), sizeof(T) * numArrayElements); + if (cudaError != cudaSuccess) { + fprintf(stderr, + "(%s, %d) cudaMallocManaged() failed (arrayName: %s).\n", + __func__, + __LINE__, + arrayName); + exit(-1); + } + } + } + cudaError = cudaSetDevice(orgDevId); + if (cudaError != cudaSuccess) { + fprintf( + stderr, "(%s, %d) cudaSetDevice() failed (arrayName: %s)\n", __func__, __LINE__, arrayName); + exit(-1); + } + return arrays; +} + +// multi_device_free +template +void _cuann_multi_device_free(T** arrays, int numDevices) +{ + for (int devId = 0; devId < numDevices; devId++) { + cudaFree(arrays[devId]); + } + free(arrays); +} + +template void _cuann_multi_device_free(float** arrays, int numDevices); +template void _cuann_multi_device_free(uint32_t** arrays, int numDevices); +template void _cuann_multi_device_free(uint8_t** arrays, int numDevices); + +/** + * End of utils + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * start of kmeans + */ + +// update kmeans centers +void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + uint32_t* clusterSize, // [numCenters] + float* accumulatedCenters) +{ + if (accumulatedCenters == NULL) { + // accumulate + _cuann_memset(centers, 0, sizeof(float) * 
numCenters * dimCenters); + _cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters); + if (dtype == CUDA_R_32F) { + _cuann_accumulate_with_label( + numCenters, dimCenters, centers, clusterSize, numDataset, (const float*)dataset, labels); + } else if (dtype == CUDA_R_8U) { + float divisor = 256.0; + _cuann_accumulate_with_label(numCenters, + dimCenters, + centers, + clusterSize, + numDataset, + (const uint8_t*)dataset, + labels, + divisor); + } else if (dtype == CUDA_R_8I) { + float divisor = 128.0; + _cuann_accumulate_with_label(numCenters, + dimCenters, + centers, + clusterSize, + numDataset, + (const int8_t*)dataset, + labels, + divisor); + } + } else { + cudaMemcpy( + centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault); + } + + if (similarity == CUANN_SIMILARITY_INNER) { + // normalize + _cuann_normalize(numCenters, dimCenters, centers, clusterSize); + } else { + // average + _cuann_divide(numCenters, dimCenters, centers, clusterSize); + } +} + +// +static cudaStream_t _cuann_set_cublas_stream(cublasHandle_t cublasHandle, cudaStream_t stream) +{ + cublasStatus_t cublasError; + cudaStream_t cublasStream; + cublasError = cublasGetStream(cublasHandle, &cublasStream); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasGetStream() failed.\n", __func__, __LINE__); + exit(-1); + } + cublasError = cublasSetStream(cublasHandle, stream); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasSetStream() failed.\n", __func__, __LINE__); + exit(-1); + } + return cublasStream; +} + +// predict label of dataset +void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle, + const float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const float* dataset, // [numDataset, dimCenters] + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + float* workspace) +{ + cublasStatus_t cublasError; + const uint32_t dimDataset = dimCenters; + float* sqsumCenters; // [numCenters] + float* sqsumDataset; // [numDataset] + float* distances; // [numDataset, numCenters] + + sqsumCenters = workspace; + sqsumDataset = sqsumCenters + numCenters; + distances = sqsumDataset + numDataset; + + float alpha; + float beta; + if (similarity == CUANN_SIMILARITY_INNER) { + alpha = -1.0; + beta = 0.0; + } else { + _cuann_sqsum(numCenters, dimCenters, centers, sqsumCenters); + _cuann_sqsum(numDataset, dimDataset, dataset, sqsumDataset); + _cuann_outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances); + alpha = -2.0; + beta = 1.0; + } + cudaStream_t cublasStream = _cuann_set_cublas_stream(cublasHandle, NULL); + cublasError = cublasGemmEx(cublasHandle, + CUBLAS_OP_T, + CUBLAS_OP_N, + numCenters, + numDataset, + dimCenters, + &alpha, + centers, + CUDA_R_32F, + dimCenters, + dataset, + CUDA_R_32F, + dimDataset, + &beta, + distances, + CUDA_R_32F, + numCenters, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); + exit(-1); + } + _cuann_set_cublas_stream(cublasHandle, cublasStream); + _cuann_argmin(numDataset, numCenters, distances, labels); +} + +// +uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset) +{ + uint32_t chunk = (1 << 20); + if (chunk > (1 << 28) / numCenters) { + chunk = (1 << 28) / numCenters; + if (chunk > 31) { + chunk += 32; + chunk -= chunk % 64; + } else { + chunk = 64; + } + } + 
chunk = min(chunk, numDataset); + return chunk; +} + +// +size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, + uint32_t dimCenters, + uint32_t numDataset) +{ + uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); + size_t size = 0; + // float *curDataset; // [chunk, dimCenters] + size += _cuann_aligned(sizeof(float) * chunk * dimCenters); + // void *bufDataset; // [chunk, dimCenters] + size += _cuann_aligned(sizeof(float) * chunk * dimCenters); + // float *workspace; + size += _cuann_aligned(sizeof(float) * (numCenters + chunk + (numCenters * chunk))); + return size; +} + +// predict label of dataset +void _cuann_kmeans_predict(cublasHandle_t cublasHandle, + float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + bool isCenterSet, + void* _workspace, + float* tempCenters, // [numCenters, dimCenters] + uint32_t* clusterSize, // [numCenters,] + bool updateCenter) +{ + if (!isCenterSet) { + // If centers are not set, the labels will be determined randomly. + for (uint32_t i = 0; i < numDataset; i++) { + labels[i] = i % numCenters; + } + if (tempCenters != NULL && clusterSize != NULL) { + // update centers + _cuann_kmeans_update_centers(centers, + numCenters, + dimCenters, + dataset, + dtype, + numDataset, + labels, + similarity, + clusterSize, + nullptr); + } + return; + } + + cudaError_t cudaError; + uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); + void* workspace = _workspace; + if (_workspace == NULL) { + size_t sizeWorkspace = _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset); + cudaError = cudaMallocManaged(&workspace, sizeWorkspace); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + exit(-1); + } + } + float* curDataset; // [chunk, dimCenters] + void* bufDataset; // [chunk, dimCenters] + float* workspace_core; + curDataset = (float*)workspace; + bufDataset = (void*)((uint8_t*)curDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); + workspace_core = + (float*)((uint8_t*)bufDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); + + if (tempCenters != NULL && clusterSize != NULL) { + _cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters); + _cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters); + } + + cudaMemcpyKind kind; + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, dataset); + if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { + kind = cudaMemcpyDeviceToDevice; + } else { + kind = cudaMemcpyHostToDevice; + } + + for (uint64_t is = 0; is < numDataset; is += chunk) { + uint64_t ie = min(is + chunk, (uint64_t)numDataset); + uint32_t nDataset = ie - is; + + if (dtype == CUDA_R_32F) { + cudaError = cudaMemcpyAsync(bufDataset, + (float*)dataset + (is * dimCenters), + sizeof(float) * nDataset * dimCenters, + kind, + NULL); + } else if (dtype == CUDA_R_8U) { + cudaError = cudaMemcpyAsync(bufDataset, + (uint8_t*)dataset + (is * dimCenters), + sizeof(uint8_t) * nDataset * dimCenters, + kind, + NULL); + } else if (dtype == CUDA_R_8I) { + cudaError = cudaMemcpyAsync(bufDataset, + (int8_t*)dataset + (is * dimCenters), + sizeof(int8_t) * nDataset * dimCenters, + kind, + NULL); + } + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMemcpy() failed.\n", 
__func__, __LINE__); + exit(-1); + } + + if (dtype == CUDA_R_32F) { +#if 0 + _cuann_copy(nDataset, dimCenters, + (const float*)bufDataset, dimCenters, + curDataset, dimCenters); +#else + // No need to copy when dtype is CUDA_R_32F + curDataset = (float*)bufDataset; +#endif + } else if (dtype == CUDA_R_8U) { + float divisor = 256.0; + _cuann_copy(nDataset, + dimCenters, + (const uint8_t*)bufDataset, + dimCenters, + curDataset, + dimCenters, + divisor); + } else if (dtype == CUDA_R_8I) { + float divisor = 128.0; + _cuann_copy(nDataset, + dimCenters, + (const int8_t*)bufDataset, + dimCenters, + curDataset, + dimCenters, + divisor); + } + + // predict + _cuann_kmeans_predict_core(cublasHandle, + centers, + numCenters, + dimCenters, + curDataset, + nDataset, + labels + is, + similarity, + workspace_core); + + if ((tempCenters != NULL) && (clusterSize != NULL)) { + // accumulate + _cuann_accumulate_with_label( + numCenters, dimCenters, tempCenters, clusterSize, nDataset, curDataset, labels + is); + } +#if 0 + // debug + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", + __func__, __LINE__); + exit(-1); + } +#endif + } + + if ((tempCenters != NULL) && (clusterSize != NULL) && updateCenter) { + _cuann_kmeans_update_centers(centers, + numCenters, + dimCenters, + dataset, + dtype, + numDataset, + labels, + similarity, + clusterSize, + tempCenters); + } + + if (_workspace == NULL) { cudaFree(workspace); } +} + +// +// predict label of dataset with multiple devices +// +void _cuann_kmeans_predict_MP(int numDevices, + cublasHandle_t* cublasHandles, // [numDevices] + float* clusterCenters, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + bool isCenterSet, + uint32_t* clusterSize, // [numCenters] + bool updateCenter // If true, cluster Centers will be updated. +) +{ + // [numDevices][numCenters, dimCenters] + float** clusterCentersCopy = _cuann_multi_device_malloc( + numDevices, numCenters * dimCenters, "clusterCentersCopy", true /* use cudaMalloc() */); + + // [numDevices][numCenters, dimCenters] + float** clusterCentersMP = + _cuann_multi_device_malloc(numDevices, numCenters * dimCenters, "clusterCentersMP"); + + // [numDevices][numCenters] + uint32_t** clusterSizeMP = + _cuann_multi_device_malloc(numDevices, numCenters, "clusterSizeMP"); + + // [numDevices][...] 
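+  // Per-device workspace for the chunked _cuann_kmeans_predict() calls below,
+  // sized by _cuann_kmeans_predict_bufferSize().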
+ size_t sizePredictWorkspace = + _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset); + void** predictWorkspaceMP = (void**)_cuann_multi_device_malloc( + numDevices, sizePredictWorkspace, "predictWorkspaceMP"); + + int orgDevId; + cudaGetDevice(&orgDevId); +#pragma omp parallel num_threads(numDevices) + { + int devId = omp_get_thread_num(); + cudaSetDevice(devId); + cudaMemcpy(clusterCentersCopy[devId], + clusterCenters, + sizeof(float) * numCenters * dimCenters, + cudaMemcpyDefault); + uint64_t d0 = (uint64_t)numDataset * (devId) / numDevices; + uint64_t d1 = (uint64_t)numDataset * (devId + 1) / numDevices; + uint64_t nDataset = d1 - d0; + void* ptrDataset; + if (dtype == CUDA_R_32F) { + ptrDataset = (void*)((float*)dataset + (uint64_t)dimCenters * d0); + } else if (dtype == CUDA_R_8U) { + ptrDataset = (void*)((uint8_t*)dataset + (uint64_t)dimCenters * d0); + } else if (dtype == CUDA_R_8I) { + ptrDataset = (void*)((int8_t*)dataset + (uint64_t)dimCenters * d0); + } + _cuann_kmeans_predict(cublasHandles[devId], + clusterCentersCopy[devId], + numCenters, + dimCenters, + ptrDataset, + dtype, + nDataset, + labels + d0, + similarity, + isCenterSet, + predictWorkspaceMP[devId], + clusterCentersMP[devId], + clusterSizeMP[devId], + false /* do not update centers */); + } + for (int devId = 0; devId < numDevices; devId++) { + // Barrier + cudaSetDevice(devId); + cudaDeviceSynchronize(); + } + cudaSetDevice(orgDevId); + if (clusterSize != NULL) { + // Reduce results to main thread + _cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters); + for (int devId = 0; devId < numDevices; devId++) { + _cuann_axpy(numCenters, 1, clusterSizeMP[devId], clusterSize); + if (devId != orgDevId) { + _cuann_axpy( + numCenters * dimCenters, 1, clusterCentersMP[devId], clusterCentersMP[orgDevId]); + } + } + if (updateCenter) { + _cuann_kmeans_update_centers(clusterCenters, + numCenters, + dimCenters, + dataset, + dtype, + numDataset, + labels, + similarity, + clusterSize, + clusterCentersMP[orgDevId]); + } + } + + _cuann_multi_device_free(clusterCentersCopy, numDevices); + _cuann_multi_device_free(clusterCentersMP, numDevices); + _cuann_multi_device_free(clusterSizeMP, numDevices); + _cuann_multi_device_free((uint8_t**)predictWorkspaceMP, numDevices); +} + +// predict labe of dataset (naive CPU version). +// (*) available only for prediction, but not for training. 
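+// Scales 8-bit inputs by 1/256 (uint8) or 1/128 (int8) to match the GPU path,
+// then brute-forces the nearest center for every row on the host.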
+void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity) +{ + float multiplier = 1.0; + if (dtype == CUDA_R_8U) { + multiplier = 1.0 / 256.0; + } else if (dtype == CUDA_R_8I) { + multiplier = 1.0 / 128.0; + } + for (uint32_t i = 0; i < numDataset; i++) { + float* vector = (float*)malloc(sizeof(float) * dimCenters); + for (uint32_t j = 0; j < dimCenters; j++) { + if (dtype == CUDA_R_32F) { + vector[j] = ((float*)dataset)[j + (dimCenters * i)]; + } else if (dtype == CUDA_R_8U) { + vector[j] = ((uint8_t*)dataset)[j + (dimCenters * i)]; + vector[j] *= multiplier; + } else if (dtype == CUDA_R_8I) { + vector[j] = ((int8_t*)dataset)[j + (dimCenters * i)]; + vector[j] *= multiplier; + } + } + float best_score; + for (uint32_t l = 0; l < numCenters; l++) { + float score = 0.0; + for (uint32_t j = 0; j < dimCenters; j++) { + if (similarity == CUANN_SIMILARITY_INNER) { + score -= vector[j] * centers[j + (dimCenters * l)]; + } else { + float diff = vector[j] - centers[j + (dimCenters * l)]; + score += diff * diff; + } + } + if ((l == 0) || (score < best_score)) { + labels[i] = l; + best_score = score; + } + } + free(vector); + } +} + +#define R_FACTOR 8 + +// +template +__global__ void kern_adjust_centers(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* _dataset, // [numDataet, dimCenters] + uint32_t numDataset, + const uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + const uint32_t* clusterSize, // [numCenters] + float threshold, + uint32_t average, + uint32_t ofst, + uint32_t* count) +{ + const T* dataset = (const T*)_dataset; + float divisor = (float)_divisor; + uint32_t l = threadIdx.y + blockDim.y * blockIdx.y; + if (l >= numCenters) return; + if (clusterSize[l] > (int)(average * threshold)) return; + + uint32_t laneId = threadIdx.x % 32; + uint32_t i; + if (laneId == 0) { + do { + uint32_t old = atomicAdd(count, 1); + i = (ofst * (old + 1)) % numDataset; + } while (clusterSize[labels[i]] < average); + } + i = __shfl_sync(0xffffffff, i, 0); + uint32_t li = labels[i]; + float sqsum = 0.0; + for (uint32_t j = laneId; j < dimCenters; j += 32) { + float val = centers[j + (uint64_t)dimCenters * li] * (R_FACTOR - 1); + val += (float)(dataset[j + (uint64_t)dimCenters * i]) / divisor; + val /= R_FACTOR; + sqsum += val * val; + centers[j + (uint64_t)dimCenters * l] = val; + } + if (similarity == CUANN_SIMILARITY_INNER) { + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); + sqsum = sqrt(sqsum); + for (uint32_t j = laneId; j < dimCenters; j += 32) { + centers[j + ((uint64_t)dimCenters * l)] /= sqsum; + } + } +} + +// adjust centers which have small number of entries +bool _cuann_kmeans_adjust_centers(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + const uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + const uint32_t* clusterSize, // [numCenters] + float threshold, + void* ws) +{ + if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { + 
fprintf(stderr, "(%s, %d) Unsupported dtype (%d)\n", __func__, __LINE__, dtype); + exit(-1); + } + bool adjusted = false; + static uint32_t iPrimes = 0; + constexpr uint32_t numPrimes = 40; + uint32_t primes[numPrimes] = {29, 71, 113, 173, 229, 281, 349, 409, 463, 541, + 601, 659, 733, 809, 863, 941, 1013, 1069, 1151, 1223, + 1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987, + 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741}; + uint32_t average = (numDataset + numCenters - 1) / numCenters; + uint32_t ofst; + do { + iPrimes = (iPrimes + 1) % numPrimes; + ofst = primes[iPrimes]; + } while (numDataset % ofst == 0); + + cudaDeviceSynchronize(); + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, dataset); + if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { + // GPU + uint32_t* count; + if (ws == NULL) { + cudaMallocManaged(&count, sizeof(uint32_t)); + } else { + count = (uint32_t*)ws; + } + count[0] = 0; + void (*kernel)(float*, + uint32_t, + uint32_t, + const void*, + uint32_t, + const uint32_t*, + cuannSimilarity_t, + const uint32_t*, + float, + uint32_t, + uint32_t, + uint32_t*); + if (dtype == CUDA_R_32F) { + kernel = kern_adjust_centers; + } else if (dtype == CUDA_R_8U) { + kernel = kern_adjust_centers; + } else if (dtype == CUDA_R_8I) { + kernel = kern_adjust_centers; + } + dim3 threads(32, 4, 1); + dim3 blocks(1, (numCenters + threads.y - 1) / threads.y, 1); + kernel<<>>(centers, + numCenters, + dimCenters, + dataset, + numDataset, + labels, + similarity, + clusterSize, + threshold, + average, + ofst, + count); + cudaDeviceSynchronize(); + if (count[0] > 0) { adjusted = true; } + if (ws == NULL) { cudaFree(count); } + } else { + // CPU + uint32_t i = 0; + uint32_t count = 0; + for (uint32_t l = 0; l < numCenters; l++) { + if (clusterSize[l] > (int)(average * threshold)) continue; + do { + i = (i + ofst) % numDataset; + } while (clusterSize[labels[i]] < average); + uint32_t li = labels[i]; + float sqsum = 0.0; + for (uint32_t j = 0; j < dimCenters; j++) { + float val = centers[j + ((uint64_t)dimCenters * li)] * (R_FACTOR - 1); + if (dtype == CUDA_R_32F) { + val += ((float*)dataset)[j + ((uint64_t)dimCenters * i)]; + } else if (dtype == CUDA_R_8U) { + float divisor = 256.0; + val += ((uint8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor; + } else if (dtype == CUDA_R_8I) { + float divisor = 128.0; + val += ((int8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor; + } + val /= R_FACTOR; + sqsum += val * val; + centers[j + ((uint64_t)dimCenters * l)] = val; + } + if (similarity == CUANN_SIMILARITY_INNER) { + sqsum = sqrt(sqsum); + for (uint32_t j = 0; j < dimCenters; j++) { + centers[j + ((uint64_t)dimCenters * l)] /= sqsum; + } + } + count += 1; + } + if (count > 0) { + adjusted = true; +#ifdef CUANN_DEBUG + fprintf(stderr, + "(%s) num adjusted: %u / %u, threshold: %d \n", + __func__, + count, + numCenters, + (int)(average * threshold)); +#endif + } + } + return adjusted; +} + +/** + * end of kmeans + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * Start of topk + */ + +// +#define NUM_THREADS 1024 // DO NOT CHANGE +#define STATE_BIT_LENGTH 8 // 0: state not used, 8: state used +#define MAX_VEC_LENGTH 8 // 1, 2, 4 or 8 +// #define CUANN_DEBUG + +// +__device__ inline uint32_t convert(uint32_t x) +{ + if (x & 0x80000000) { + return x ^ 0xffffffff; + } else { + return x ^ 0x80000000; + } +} + +// +struct u32_vector { + uint1 x1; + uint2 x2; + uint4 x4; + ulonglong4 x8; +}; + +// +template +__device__ inline 
void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i) +{ + if (vecLen == 1) { + vec.x1 = ((uint1*)(x + i))[0]; + } else if (vecLen == 2) { + vec.x2 = ((uint2*)(x + i))[0]; + } else if (vecLen == 4) { + vec.x4 = ((uint4*)(x + i))[0]; + } else if (vecLen == 8) { + vec.x8 = ((ulonglong4*)(x + i))[0]; + } +} + +// +template +__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i) +{ + uint32_t xi; + if (vecLen == 1) { + xi = convert(vec.x1.x); + } else if (vecLen == 2) { + if (i == 0) + xi = convert(vec.x2.x); + else + xi = convert(vec.x2.y); + } else if (vecLen == 4) { + if (i == 0) + xi = convert(vec.x4.x); + else if (i == 1) + xi = convert(vec.x4.y); + else if (i == 2) + xi = convert(vec.x4.z); + else + xi = convert(vec.x4.w); + } else if (vecLen == 8) { + if (i == 0) + xi = convert((uint32_t)(vec.x8.x & 0xffffffff)); + else if (i == 1) + xi = convert((uint32_t)(vec.x8.x >> 32)); + else if (i == 2) + xi = convert((uint32_t)(vec.x8.y & 0xffffffff)); + else if (i == 3) + xi = convert((uint32_t)(vec.x8.y >> 32)); + else if (i == 4) + xi = convert((uint32_t)(vec.x8.z & 0xffffffff)); + else if (i == 5) + xi = convert((uint32_t)(vec.x8.z >> 32)); + else if (i == 6) + xi = convert((uint32_t)(vec.x8.w & 0xffffffff)); + else + xi = convert((uint32_t)(vec.x8.w >> 32)); + } + return xi; +} + +// +template +__launch_bounds__(1024, 2) __global__ + void kern_topk_cg_11(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch,] + const uint32_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels, // [size_batch, topk,] + uint32_t* _count // [size_batch, 5 * 1024,] + ) +{ + __shared__ uint32_t smem[2048 + 6]; + uint32_t* best_index = &(smem[2048]); + uint32_t* best_csum = &(smem[2048 + 3]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + uint32_t i_batch = blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x * gridDim.x; + uint32_t thread_id = threadIdx.x + (blockDim_x * blockIdx.x); + + const uint32_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 6) { smem[2048 + threadIdx.x] = 0; } + + uint32_t* count = _count + (5 * 1024 * i_batch); + for (int i = thread_id; i < 5 * 1024; i += num_threads) { + count[i] = 0; + } + cg::sync(grid); + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
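+  // The 32-bit keys are split into three digits of 11, 11 and 10 bits. Each
+  // pass histograms ((x - threshold) >> shift), prefix-sums the bins with
+  // cub::BlockScan, and keeps the largest digit whose cumulative count stays
+  // below topk; that digit is folded into `threshold` before the finer pass.
+  // E.g. with topk = 100: if 70 keys lie strictly below digit k and 130 lie
+  // strictly below k + 1, digit k is chosen and count_below becomes 70.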
+ // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (21 - 11 * j); + for (int i = threadIdx.x; i < 2048; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 2048) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 2048) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + for (int i = threadIdx.x; i < 2048; i += blockDim_x) { + if (smem[i] > 0) { atomicAdd(&(count[i + (2048 * j)]), smem[i]); } + } + cg::sync(grid); + + constexpr int n_data = 2048 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = count[i + (n_data * threadIdx.x) + (2048 * j)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + +#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] >= topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + } + + { + uint32_t j = 2; + for (int i = threadIdx.x; i < 1024; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold); // 0 <= k + if (k >= 1024) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 1024) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + for (int i = threadIdx.x; i < 1024; i += blockDim_x) { + if (smem[i] > 0) { atomicAdd(&(count[i + (2048 * j)]), smem[i]); } + } + cg::sync(grid); + + constexpr int n_data = 1024 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = count[i + (n_data * threadIdx.x) + (2048 * j)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + 
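+    // Walk the bins from highest to lowest and record, via atomicMax, the
+    // largest bin whose cumulative count still keeps the total below topk;
+    // this last 10-bit pass fully resolves the threshold.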
+#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] >= topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += best_index[j]; + } + + // + // Get labels that satifies "x[i] < threshold". + // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector vec; + load_u32_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[2048] < topk)) { + if (count_below + atomicAdd(&count[2048], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } + +#ifdef CUANN_DEBUG + cg::sync(grid); + if (thread_id == 0 && count[0] < topk) { + printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", + i_batch, + topk, + count[0], + count_below, + threshold); + } +#endif +} + +// +template +__launch_bounds__(1024, 2) __global__ + void kern_topk_cta_11(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch, max_len_x,] + const uint32_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels // [size_batch, topk,] + ) +{ + __shared__ uint32_t smem[2048 + 3 + 3 + 2]; + uint32_t* best_index = &(smem[2048]); + uint32_t* best_csum = &(smem[2048 + 3]); + uint32_t* count = &(smem[2048 + 6]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + uint32_t i_batch = blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x; + uint32_t thread_id = threadIdx.x; + + const uint32_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 8) { smem[2048 + threadIdx.x] = 0; } + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
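+  // Same three-pass digit selection as kern_topk_cg_11 above, but everything
+  // stays within a single thread block: the per-digit histograms live in
+  // shared memory and no grid-wide sync is needed, so each block processes
+  // one batch entry independently.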
+ // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (21 - 11 * j); + for (int i = threadIdx.x; i < 2048; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 2048) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 2048) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + constexpr int n_data = 2048 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = smem[i + (n_data * threadIdx.x)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + +#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] > topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + if (count_below == topk) break; + } + + { + uint32_t j = 2; + for (int i = threadIdx.x; i < 1024; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold); // 0 <= k + if (k >= 1024) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 1024) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + constexpr int n_data = 1024 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = smem[i + (n_data * threadIdx.x)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + +#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] > topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + 
threshold += best_index[j]; + } + + // + // Get labels that satifies "x[i] < threshold". + // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector vec; + load_u32_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[1] < topk)) { + if (count_below + atomicAdd(&count[1], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } + +#ifdef CUANN_DEBUG + __syncthreads(); + if (thread_id == 0 && count[0] < topk) { + printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", + i_batch, + topk, + count[0], + count_below, + threshold); + } +#endif +} + +// +__device__ inline uint16_t convert(uint16_t x) +{ + if (x & 0x8000) { + return x ^ 0xffff; + } else { + return x ^ 0x8000; + } +} + +// +struct u16_vector { + ushort1 x1; + ushort2 x2; + ushort4 x4; + uint4 x8; +}; + +// +template +__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i) +{ + if (vecLen == 1) { + vec.x1 = ((ushort1*)(x + i))[0]; + } else if (vecLen == 2) { + vec.x2 = ((ushort2*)(x + i))[0]; + } else if (vecLen == 4) { + vec.x4 = ((ushort4*)(x + i))[0]; + } else if (vecLen == 8) { + vec.x8 = ((uint4*)(x + i))[0]; + } +} + +// +template +__device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i) +{ + uint16_t xi; + if (vecLen == 1) { + xi = convert(vec.x1.x); + } else if (vecLen == 2) { + if (i == 0) + xi = convert(vec.x2.x); + else + xi = convert(vec.x2.y); + } else if (vecLen == 4) { + if (i == 0) + xi = convert(vec.x4.x); + else if (i == 1) + xi = convert(vec.x4.y); + else if (i == 2) + xi = convert(vec.x4.z); + else + xi = convert(vec.x4.w); + } else if (vecLen == 8) { + if (i == 0) + xi = convert((uint16_t)(vec.x8.x & 0xffff)); + else if (i == 1) + xi = convert((uint16_t)(vec.x8.x >> 16)); + else if (i == 2) + xi = convert((uint16_t)(vec.x8.y & 0xffff)); + else if (i == 3) + xi = convert((uint16_t)(vec.x8.y >> 16)); + else if (i == 4) + xi = convert((uint16_t)(vec.x8.z & 0xffff)); + else if (i == 5) + xi = convert((uint16_t)(vec.x8.z >> 16)); + else if (i == 6) + xi = convert((uint16_t)(vec.x8.w & 0xffff)); + else + xi = convert((uint16_t)(vec.x8.w >> 16)); + } + return xi; +} + +// +template +__launch_bounds__(1024, 2) __global__ + void kern_topk_cg_8(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch,] + const uint16_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels, // [size_batch, topk,] + uint32_t* _count // [size_batch, 5 * 1024,] + ) +{ + __shared__ uint32_t smem[256 + 4]; + uint32_t* best_index = &(smem[256]); + uint32_t* best_csum = &(smem[256 + 2]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + uint32_t i_batch = 
blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x * gridDim.x; + uint32_t thread_id = threadIdx.x + (blockDim_x * blockIdx.x); + + const uint16_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 4) { smem[256 + threadIdx.x] = 0; } + + uint32_t* count = _count + (2 * 256 * i_batch); + for (int i = thread_id; i < 2 * 256; i += num_threads) { + count[i] = 0; + } + cg::sync(grid); + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". + // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (8 - 8 * j); + for (int i = threadIdx.x; i < 256; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector x_vec; + load_u16_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u16_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 256) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 256) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + for (int i = threadIdx.x; i < 256; i += blockDim_x) { + if (smem[i] > 0) { atomicAdd(&(count[i + (256 * j)]), smem[i]); } + } + cg::sync(grid); + + uint32_t csum[1]; + csum[0] = 0; + if (threadIdx.x < 256) { csum[0] = count[threadIdx.x + (256 * j)]; } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + + if (threadIdx.x < 256) { + if (count_below + csum[0] < topk) { + uint32_t index = threadIdx.x; + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[0]); + } + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + } + + // + // Get labels that satifies "x[i] < threshold". 
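+  // Ties (x[i] == threshold) are emitted only while the remaining quota
+  // (topk - count_below) is not yet filled; count[256] counts how many ties
+  // have been taken so far.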
+ // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector vec; + load_u16_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u16_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[256] < topk)) { + if (count_below + atomicAdd(&count[256], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } + +#ifdef CUANN_DEBUG + cg::sync(grid); + if (thread_id == 0 && count[0] < topk) { + printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", + i_batch, + topk, + count[0], + count_below, + threshold); + } +#endif +} + +// +template +__launch_bounds__(1024, 2) __global__ + void kern_topk_cta_8(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch, max_len_x,] + const uint16_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels // [size_batch, topk,] + ) +{ + __shared__ uint32_t smem[256 + 6]; + uint32_t* best_index = &(smem[256]); + uint32_t* best_csum = &(smem[256 + 2]); + uint32_t* count = &(smem[256 + 4]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + uint32_t i_batch = blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x; + uint32_t thread_id = threadIdx.x; + + const uint16_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 6) { smem[256 + threadIdx.x] = 0; } + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
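+  // The 16-bit keys are refined in two radix passes of 8 bits each over 256
+  // shared-memory bins: pass 0 histograms the upper byte (shift = 8), pass 1
+  // the lower byte (shift = 0).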
+ // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (8 - 8 * j); + for (int i = threadIdx.x; i < 256; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector x_vec; + load_u16_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u16_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 256) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 256) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + uint32_t csum[1]; + if (threadIdx.x < 256) { csum[0] = smem[threadIdx.x]; } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + + if (threadIdx.x < 256) { + if (count_below + csum[0] < topk) { + uint32_t index = threadIdx.x; + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[0]); + } + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + if (count_below == topk) break; + } + + // + // Get labels that satifies "x[i] < threshold". + // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector vec; + load_u16_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u16_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[1] < topk)) { + if (count_below + atomicAdd(&count[1], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } + +#ifdef CUANN_DEBUG + __syncthreads(); + if (thread_id == 0 && count[0] < topk) { + printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", + i_batch, + topk, + count[0], + count_below, + threshold); + } +#endif +} + +// +__global__ void _sort_topk_prep(uint32_t sizeBatch, + uint32_t topK, + uint32_t maxSamples, + const uint32_t* labels, // [sizeBatch, topK] + const float* samples, // [sizeBatch, maxSamples] + int* offsets, // [sizeBatch + 1] + float* outputs // [sizeBatch, topK] +) +{ + uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + if (tid < sizeBatch + 1) { offsets[tid] = tid * topK; } + if (tid < sizeBatch * topK) { + uint32_t label = labels[tid]; + uint32_t iBatch = tid / topK; + float value = samples[label + (maxSamples * iBatch)]; + outputs[tid] = value; + } +} + +// +size_t _cuann_find_topk_bufferSize(cuannHandle_t handle, + uint32_t topK, + uint32_t 
sizeBatch, + uint32_t maxSamples, + cudaDataType_t sampleDtype = CUDA_R_32F) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + assert(stateBitLen == 0 || stateBitLen == 8); + + size_t workspaceSize = 0; + // count + if (sampleDtype == CUDA_R_16F) { + workspaceSize += _cuann_aligned(sizeof(uint32_t) * sizeBatch * 2 * 256); + } else { + workspaceSize += _cuann_aligned(sizeof(uint32_t) * sizeBatch * 5 * 1024); + } + // state + if (stateBitLen == 8) { + // (*) Each thread has at least one array element for state + uint32_t numBlocks_perBatch = + ((handle->deviceProp).multiProcessorCount * 2 + sizeBatch) / sizeBatch; + uint32_t numThreads_perBatch = numThreads * numBlocks_perBatch; + uint32_t numSample_perThread = (maxSamples + numThreads_perBatch - 1) / numThreads_perBatch; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + workspaceSize += + _cuann_aligned(sizeof(uint8_t) * numState_perThread * numThreads_perBatch * sizeBatch); + } + + size_t workspaceSize2 = 0; + // offsets + workspaceSize2 += _cuann_aligned(sizeof(int) * (sizeBatch + 1)); + // keys_in, keys_out, values_out + workspaceSize2 += _cuann_aligned(sizeof(float) * sizeBatch * topK); + workspaceSize2 += _cuann_aligned(sizeof(float) * sizeBatch * topK); + workspaceSize2 += _cuann_aligned(sizeof(uint32_t) * sizeBatch * topK); + // cub_ws + size_t cub_ws_size = 0; + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + cub_ws_size, + (float*)NULL, + (float*)NULL, + (uint32_t*)NULL, + (uint32_t*)NULL, + sizeBatch * topK, + sizeBatch, + (int*)NULL, + (int*)NULL); + workspaceSize2 += _cuann_aligned(cub_ws_size); + workspaceSize = max(workspaceSize, workspaceSize2); + + return workspaceSize; +} + +// +int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH) +{ + int vecLen = min(maxVecLen, MAX_VEC_LENGTH); + while ((maxSamples % vecLen) != 0) { + vecLen /= 2; + } + return vecLen; +} + +// +void _cuann_find_topk(cuannHandle_t handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + uint32_t* numSamples, // [sizeBatch,] + const float* samples, // [sizeBatch, maxSamples,] + uint32_t* labels, // [sizeBatch, topK,] + void* workspace, + bool sort = false) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + assert(stateBitLen == 0 || stateBitLen == 8); +#ifdef CUANN_DEBUG + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle->stream); +#endif + + // Limit the maximum value of vecLen to 4. In the case of FP32, + // setting vecLen = 8 in cg_kernel causes too much register usage. + int vecLen = _get_vecLen(maxSamples, 4); + void* cg_kernel; + if (vecLen == 4) { + cg_kernel = (void*)kern_topk_cg_11; + } else if (vecLen == 2) { + cg_kernel = (void*)kern_topk_cg_11; + } else if (vecLen == 1) { + cg_kernel = (void*)kern_topk_cg_11; + } + + int numBlocksPerSm_topk; + size_t dynamicSMemSize = 0; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm_topk, cg_kernel, numThreads, dynamicSMemSize); + int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); + int numBlocks = min(numBlocks_perBatch * sizeBatch, + (handle->deviceProp).multiProcessorCount * numBlocksPerSm_topk); + numBlocks_perBatch = max(numBlocks / sizeBatch, 1); + if (maxSamples <= numThreads * 10) { + // When number of sample is small, using multiple thread-blocks does not + // improve performance, in which case cta_kernel is used. 
Tentatively, + // "numThreads * 10" is used as the threshold, but this may be better + // determined by auto-tuning, etc. + numBlocks_perBatch = 1; + } + uint32_t* count = (uint32_t*)workspace; + uint8_t* state = NULL; + if (stateBitLen == 8) { + state = (uint8_t*)count + _cuann_aligned(sizeof(uint32_t) * sizeBatch * 5 * 1024); + } + + dim3 threads(numThreads, 1, 1); + dim3 blocks(numBlocks_perBatch, sizeBatch, 1); + if (numBlocks_perBatch <= 1) { + void (*cta_kernel)( + uint32_t, uint32_t, uint32_t, uint32_t*, const uint32_t*, uint8_t*, uint32_t*); + int vecLen = _get_vecLen(maxSamples); + if (vecLen == 8) { + cta_kernel = kern_topk_cta_11; + } else if (vecLen == 4) { + cta_kernel = kern_topk_cta_11; + } else if (vecLen == 2) { + cta_kernel = kern_topk_cta_11; + } else if (vecLen == 1) { + cta_kernel = kern_topk_cta_11; + } + cta_kernel<<stream>>>( + topK, sizeBatch, maxSamples, numSamples, (const uint32_t*)samples, state, labels); + } else { + void* args[9]; + args[0] = {&(topK)}; + args[1] = {&(sizeBatch)}; + args[2] = {&(maxSamples)}; + args[3] = {&(numSamples)}; + args[4] = {&(samples)}; + args[5] = {&(state)}; + args[6] = {&(labels)}; + args[7] = {&(count)}; + args[8] = {nullptr}; + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle->stream); + } + if (!sort) { return; } + + // offsets: [sizeBatch + 1] + // keys_in, keys_out, values_out: [sizeBatch, topK] + int* offsets = (int*)workspace; + float* keys_in = (float*)((uint8_t*)offsets + _cuann_aligned(sizeof(int) * (sizeBatch + 1))); + float* keys_out = (float*)((uint8_t*)keys_in + _cuann_aligned(sizeof(float) * sizeBatch * topK)); + uint32_t* values_out = + (uint32_t*)((uint8_t*)keys_out + _cuann_aligned(sizeof(float) * sizeBatch * topK)); + void* cub_ws = + (void*)((uint8_t*)values_out + _cuann_aligned(sizeof(uint32_t) * sizeBatch * topK)); + + dim3 stpThreads(128, 1, 1); + dim3 stpBlocks((max(sizeBatch + 1, sizeBatch * topK) + stpThreads.x - 1) / stpThreads.x, 1, 1); + _sort_topk_prep<<stream>>>( + sizeBatch, topK, maxSamples, labels, samples, offsets, keys_in); + + size_t cub_ws_size = 0; + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + cub_ws_size, + keys_in, + keys_out, + labels, + values_out, + sizeBatch * topK, + sizeBatch, + offsets, + offsets + 1); + + cub::DeviceSegmentedRadixSort::SortPairs(cub_ws, + cub_ws_size, + keys_in, + keys_out, + labels, + values_out, + sizeBatch * topK, + sizeBatch, + offsets, + offsets + 1, + (int)0, + (int)(sizeof(float) * 8), + handle->stream); + + cudaMemcpyAsync(labels, + values_out, + sizeof(uint32_t) * sizeBatch * topK, + cudaMemcpyDeviceToDevice, + handle->stream); +} + +// +void _cuann_find_topk(cuannHandle_t handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + uint32_t* numSamples, // [sizeBatch,] + const half* samples, // [sizeBatch, maxSamples,] + uint32_t* labels, // [sizeBatch, topK,] + void* workspace, + bool sort = false) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + assert(stateBitLen == 0 || stateBitLen == 8); +#ifdef CUANN_DEBUG + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle->stream); +#endif + + int vecLen = _get_vecLen(maxSamples); + void* cg_kernel; + if (vecLen == 8) { + cg_kernel = (void*)kern_topk_cg_8; + } else if (vecLen == 4) { + cg_kernel = (void*)kern_topk_cg_8; + } else if (vecLen == 2) { + cg_kernel = (void*)kern_topk_cg_8; + } else if (vecLen == 1) { + cg_kernel = (void*)kern_topk_cg_8; + } + + int numBlocksPerSm_topk; + 
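+  // Query the occupancy of the cooperative kernel: the grid size is capped at
+  // (SM count) * (blocks per SM) so that all thread blocks are resident at once,
+  // which cudaLaunchCooperativeKernel and the grid-wide sync require.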
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm_topk, cg_kernel, numThreads, 0); + int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); + int numBlocks = min(numBlocks_perBatch * sizeBatch, + (handle->deviceProp).multiProcessorCount * numBlocksPerSm_topk); + numBlocks_perBatch = max(numBlocks / sizeBatch, 1); + if (maxSamples <= numThreads * 10) { + // When number of sample is small, using multiple thread-blocks does not + // improve performance, in which case cta_kernel is used. Tentatively, + // "numThreads * 10" is used as the threshold, but this may be better + // determined by auto-tuning, etc. + numBlocks_perBatch = 1; + } + uint32_t* count = (uint32_t*)workspace; + uint8_t* state = NULL; + if (stateBitLen == 8) { + state = (uint8_t*)count + _cuann_aligned(sizeof(uint32_t) * sizeBatch * 2 * 256); + } + + dim3 threads(numThreads, 1, 1); + dim3 blocks(numBlocks_perBatch, sizeBatch, 1); + if (numBlocks_perBatch <= 1) { + void (*cta_kernel)( + uint32_t, uint32_t, uint32_t, uint32_t*, const uint16_t*, uint8_t*, uint32_t*); + int vecLen = _get_vecLen(maxSamples); + if (vecLen == 8) { + cta_kernel = kern_topk_cta_8; + } else if (vecLen == 4) { + cta_kernel = kern_topk_cta_8; + } else if (vecLen == 2) { + cta_kernel = kern_topk_cta_8; + } else if (vecLen == 1) { + cta_kernel = kern_topk_cta_8; + } + cta_kernel<<stream>>>( + topK, sizeBatch, maxSamples, numSamples, (const uint16_t*)samples, state, labels); + } else { + void* args[9]; + args[0] = {&(topK)}; + args[1] = {&(sizeBatch)}; + args[2] = {&(maxSamples)}; + args[3] = {&(numSamples)}; + args[4] = {&(samples)}; + args[5] = {&(state)}; + args[6] = {&(labels)}; + args[7] = {&(count)}; + args[8] = {nullptr}; + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle->stream); + } +} + +/** + * + * End of topk + * + * + * + * + * + * + * + * + * + * + * Start of ivfpq + */ + +// +size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc); + +// search +template +void ivfpq_search(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + uint32_t numQueries, + const float* clusterCenters, // [numDataset, dimDataset] + const float* pqCenters, // [dimPq, 256, lenPq] + const uint8_t* pqDataset, // [numDataset, dimPq] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* indexPtr, // [numClusters + 1] + const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] + const float* query, // [dimDataset] + uint64_t* topKNeighbors, // [topK] + float* topKDistances, // [topK] + void* workspace); + +void ivfpq_encode(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] +); + +// +bool manage_local_topk(cuannIvfPqDescriptor_t desc); +size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads); + +// +__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] + float initValue, + uint32_t num); + +// +__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList); + +// +__global__ void ivfpq_make_chunk_index_ptr( + uint32_t numProbes, + uint32_t sizeBatch, + const uint32_t* indexPtr, // [numClusters + 1,] + const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] + uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] + uint32_t* numSamples // [sizeBatch,] +); + +// +template +__global__ void ivfpq_make_outputs(uint32_t numProbes, + 
uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const scoreDtype* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +); + +// +__device__ inline uint32_t warp_scan(uint32_t x) +{ + uint32_t y; + y = __shfl_up_sync(0xffffffff, x, 1); + if (threadIdx.x % 32 >= 1) x += y; + y = __shfl_up_sync(0xffffffff, x, 2); + if (threadIdx.x % 32 >= 2) x += y; + y = __shfl_up_sync(0xffffffff, x, 4); + if (threadIdx.x % 32 >= 4) x += y; + y = __shfl_up_sync(0xffffffff, x, 8); + if (threadIdx.x % 32 >= 8) x += y; + y = __shfl_up_sync(0xffffffff, x, 16); + if (threadIdx.x % 32 >= 16) x += y; + return x; +} + +// +__device__ inline uint32_t thread_block_scan(uint32_t x, uint32_t* smem) +{ + x = warp_scan(x); + __syncthreads(); + if (threadIdx.x % 32 == 31) { smem[threadIdx.x / 32] = x; } + __syncthreads(); + if (threadIdx.x < 32) { smem[threadIdx.x] = warp_scan(smem[threadIdx.x]); } + __syncthreads(); + if (threadIdx.x / 32 > 0) { x += smem[threadIdx.x / 32 - 1]; } + __syncthreads(); + return x; +} + +// +__global__ void ivfpq_make_chunk_index_ptr( + uint32_t numProbes, + uint32_t sizeBatch, + const uint32_t* indexPtr, // [numClusters + 1,] + const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] + uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] + uint32_t* numSamples // [sizeBatch,] +) +{ + __shared__ uint32_t smem_temp[32]; + __shared__ uint32_t smem_base[2]; + + uint32_t iBatch = blockIdx.x; + if (iBatch >= sizeBatch) return; + const uint32_t* clusterLabelsToProbe = _clusterLabelsToProbe + (numProbes * iBatch); + uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); + + // + uint32_t j_end = (numProbes + 1024 - 1) / 1024; + for (uint32_t j = 0; j < j_end; j++) { + uint32_t i = threadIdx.x + (1024 * j); + uint32_t val = 0; + if (i < numProbes) { + uint32_t l = clusterLabelsToProbe[i]; + val = indexPtr[l + 1] - indexPtr[l]; + } + val = thread_block_scan(val, smem_temp); + + if (i < numProbes) { + if (j > 0) { val += smem_base[(j - 1) & 0x1]; } + chunkIndexPtr[i] = val; + if (i == numProbes - 1) { numSamples[iBatch] = val; } + } + + if ((j < j_end - 1) && (threadIdx.x == 1023)) { smem_base[j & 0x1] = val; } + } +} + +// +__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] + float initValue, + uint32_t num) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= num) return; + topkScores[i] = initValue; +} + +// +__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= numElement) return; + indexList[i] = i; +} + +// +__device__ inline void ivfpq_get_id_dataset(uint32_t iSample, + uint32_t numProbes, + const uint32_t* clusterIndexPtr, // [numClusters + 1,] + const uint32_t* clusterLabels, // [numProbes,] + const uint32_t* chunkIndexPtr, // [numProbes,] + uint32_t& iChunk, + uint32_t& label, + uint32_t& iDataset) +{ + uint32_t minChunk = 0; + uint32_t maxChunk = numProbes - 1; + iChunk = (minChunk + maxChunk) / 2; + while (minChunk < maxChunk) { + if (iSample >= chunkIndexPtr[iChunk]) { + minChunk = iChunk + 
1; + } else { + maxChunk = iChunk; + } + iChunk = (minChunk + maxChunk) / 2; + } + + label = clusterLabels[iChunk]; + uint32_t iSampleInChunk = iSample; + if (iChunk > 0) { iSampleInChunk -= chunkIndexPtr[iChunk - 1]; } + iDataset = iSampleInChunk + clusterIndexPtr[label]; +} + +// +template +__global__ void ivfpq_make_outputs(uint32_t numProbes, + uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const scoreDtype* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= topk) return; + uint32_t iBatch = blockIdx.y; + if (iBatch >= sizeBatch) return; + + uint32_t iSample = topkSampleIds[i + (topk * iBatch)]; + if (scoreTopkIndex == NULL) { + // 0 <= iSample < maxSamples + topkScores[i + (topk * iBatch)] = scores[iSample + (maxSamples * iBatch)]; + uint32_t iChunk; + uint32_t label; + uint32_t iDataset; + ivfpq_get_id_dataset(iSample, + numProbes, + clusterIndexPtr, + clusterLabels + (numProbes * iBatch), + chunkIndexPtr + (numProbes * iBatch), + iChunk, + label, + iDataset); + topkNeighbors[i + (topk * iBatch)] = originalNumbers[iDataset]; + } else { + // 0 <= iSample < (numProbes * topk) + topkScores[i + (topk * iBatch)] = scores[iSample + ((numProbes * topk) * iBatch)]; + uint32_t iDataset = scoreTopkIndex[iSample + ((numProbes * topk) * iBatch)]; + topkNeighbors[i + (topk * iBatch)] = originalNumbers[iDataset]; + } +} + +// +bool manage_local_topk(cuannIvfPqDescriptor_t desc) +{ + int depth = (desc->topK + 31) / 32; + if (depth > 4) { return false; } + if (desc->numProbes < 16) { return false; } + if (desc->maxBatchSize * desc->numProbes < 256) { return false; } + return true; +} + +// +size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads) +{ + if (manage_local_topk(desc)) { + int topk_32 = (desc->topK + 31) / 32; + return (sizeof(float) + sizeof(uint32_t)) * (numThreads / 2) * topk_32; + } + return 0; +} + +// return workspace size +size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc) +{ + size_t size = 0; + // clusterLabelsOut [maxBatchSize, numProbes] + size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // indexList [maxBatchSize * numProbes] + size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // indexListSorted [maxBatchSize * numProbes] + size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // numSamples [maxBatchSize,] + size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize); + // cubWorkspace + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + uint32_t* d_keys_in = NULL; + uint32_t* d_keys_out = NULL; + uint32_t* d_values_in = NULL; + uint32_t* d_values_out = NULL; + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + desc->maxBatchSize * desc->numProbes); + desc->sizeCubWorkspace = _cuann_aligned(temp_storage_bytes); + size += desc->sizeCubWorkspace; + // chunkIndexPtr [maxBatchSize, numProbes] + size += 
_cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // topkSids [maxBatchSize, topk] + size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->topK); + // similarity + size_t unit_size = sizeof(float); + if (desc->internalDistanceDtype == CUDA_R_16F) { unit_size = sizeof(half); } + if (manage_local_topk(desc)) { + // [matBatchSize, numProbes, topK] + size += _cuann_aligned(unit_size * desc->maxBatchSize * desc->numProbes * desc->topK); + } else { + // [matBatchSize, maxSamples] + size += _cuann_aligned(unit_size * desc->maxBatchSize * desc->maxSamples); + } + // simTopkIndex + if (manage_local_topk(desc)) { + // [matBatchSize, numProbes, topk] + size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes * desc->topK); + } + // topkScores + if (manage_local_topk(desc)) { + // [maxBatchSize, topk] + size += _cuann_aligned(sizeof(float) * desc->maxBatchSize * desc->topK); + } + // preCompScores [multiProcessorCount, dimPq, 1 << bitPq,] + size += _cuann_aligned(sizeof(float) * (handle->deviceProp).multiProcessorCount * desc->dimPq * + (1 << desc->bitPq)); + // topkWorkspace + if (manage_local_topk(desc)) { + size += _cuann_find_topk_bufferSize(handle, + desc->topK, + desc->maxBatchSize, + desc->numProbes * desc->topK, + desc->internalDistanceDtype); + } else { + size += _cuann_find_topk_bufferSize( + handle, desc->topK, desc->maxBatchSize, desc->maxSamples, desc->internalDistanceDtype); + } + return size; +} + +// +__device__ __host__ inline void ivfpq_encode_core( + uint32_t ldDataset, uint32_t dimPq, uint32_t bitPq, const uint32_t* label, uint8_t* output) +{ + for (uint32_t j = 0; j < dimPq; j++) { + uint8_t code = label[(ldDataset * j)]; + if (bitPq == 8) { + uint8_t* ptrOutput = output + j; + ptrOutput[0] = code; + } else if (bitPq == 7) { + uint8_t* ptrOutput = output + 7 * (j / 8); + if (j % 8 == 0) { + ptrOutput[0] |= code; + } else if (j % 8 == 1) { + ptrOutput[0] |= code << 7; + ptrOutput[1] |= code >> 1; + } else if (j % 8 == 2) { + ptrOutput[1] |= code << 6; + ptrOutput[2] |= code >> 2; + } else if (j % 8 == 3) { + ptrOutput[2] |= code << 5; + ptrOutput[3] |= code >> 3; + } else if (j % 8 == 4) { + ptrOutput[3] |= code << 4; + ptrOutput[4] |= code >> 4; + } else if (j % 8 == 5) { + ptrOutput[4] |= code << 3; + ptrOutput[5] |= code >> 5; + } else if (j % 8 == 6) { + ptrOutput[5] |= code << 2; + ptrOutput[6] |= code >> 6; + } else if (j % 8 == 7) { + ptrOutput[6] |= code << 1; + } + } else if (bitPq == 6) { + uint8_t* ptrOutput = output + 3 * (j / 4); + if (j % 4 == 0) { + ptrOutput[0] |= code; + } else if (j % 4 == 1) { + ptrOutput[0] |= code << 6; + ptrOutput[1] |= code >> 2; + } else if (j % 4 == 2) { + ptrOutput[1] |= code << 4; + ptrOutput[2] |= code >> 4; + } else if (j % 4 == 3) { + ptrOutput[2] |= code << 2; + } + } else if (bitPq == 5) { + uint8_t* ptrOutput = output + 5 * (j / 8); + if (j % 8 == 0) { + ptrOutput[0] |= code; + } else if (j % 8 == 1) { + ptrOutput[0] |= code << 5; + ptrOutput[1] |= code >> 3; + } else if (j % 8 == 2) { + ptrOutput[1] |= code << 2; + } else if (j % 8 == 3) { + ptrOutput[1] |= code << 7; + ptrOutput[2] |= code >> 1; + } else if (j % 8 == 4) { + ptrOutput[2] |= code << 4; + ptrOutput[3] |= code >> 4; + } else if (j % 8 == 5) { + ptrOutput[3] |= code << 1; + } else if (j % 8 == 6) { + ptrOutput[3] |= code << 6; + ptrOutput[4] |= code >> 2; + } else if (j % 8 == 7) { + ptrOutput[4] |= code << 3; + } + } else if (bitPq == 4) { + uint8_t* ptrOutput = output + (j / 2); + if (j % 2 == 0) { + 
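+        // Two 4-bit codes share one byte: even j fills the low nibble,
+        // odd j the high nibble.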
ptrOutput[0] |= code; + } else { + ptrOutput[0] |= code << 4; + } + } + } +} + +// +__global__ void ivfpq_encode_kernel(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] +) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= numDataset) return; + ivfpq_encode_core(ldDataset, dimPq, bitPq, label + i, output + (dimPq * bitPq / 8) * i); +} + +// +void ivfpq_encode(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] +) +{ +#if 1 + // GPU + dim3 iekThreads(128, 1, 1); + dim3 iekBlocks((numDataset + iekThreads.x - 1) / iekThreads.x, 1, 1); + ivfpq_encode_kernel<<>>( + numDataset, ldDataset, dimPq, bitPq, label, output); +#else + // CPU + cudaDeviceSynchronize(); + for (uint32_t i = 0; i < numDataset; i++) { + ivfpq_encode_core(ldDataset, dimPq, bitPq, label + i, output + (dimPq * bitPq / 8) * i); + } +#endif +} + +// +template __global__ void ivfpq_make_outputs( + uint32_t numProbes, + uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const float* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +); + +// +template __global__ void ivfpq_make_outputs( + uint32_t numProbes, + uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const half* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +); + +/** + * End of ivfpq + * + * + * + * + */ + +cuannStatus_t cuannCreate(cuannHandle_t* handle); +cuannStatus_t cuannDestroy(cuannHandle_t handle); +cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream); +cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId); + +cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc); +cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc); + +cuannStatus_t cuannIvfPqSetIndexParameters( + cuannIvfPqDescriptor_t desc, + const uint32_t numClusters, /* Number of clusters */ + const uint32_t numDataset, /* Number of dataset entries */ + const uint32_t dimDataset, /* Dimension of each entry */ + const uint32_t dimPq, /* Dimension of each entry after product quantization */ + const uint32_t bitPq, /* Bit length of PQ */ + const cuannSimilarity_t similarity, + const cuannPqCenter_t typePqCenter); + +cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + 
uint32_t* bitPq, + cuannSimilarity_t* similarity, + cuannPqCenter_t* typePqCenter); + +cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, + size_t* size /* bytes of dataset index */); + +cuannStatus_t cuannIvfPqBuildIndex( + cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* dataset, /* [numDataset, dimDataset] */ + const void* trainset, /* [numTrainset, dimDataset] */ + cudaDataType_t dtype, + uint32_t numTrainset, /* Number of train-set entries */ + uint32_t numIterations, /* Number of iterations to train kmeans */ + bool randomRotation, /* If true, rotate vectors with randamly created rotation matrix */ + bool hierarchicalClustering, /* If true, do kmeans training hierarchically */ + void* index /* database index to build */); + +cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const char* fileName); + +cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + void** index, + const char* fileName); + +cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( + cuannHandle_t handle, + const char* oldIndexFileName, + const char* newIndexFileName, + const void* newVectors, /* [numVectorsToAdd, dimDataset] */ + uint32_t numNewVectors); + +cuannStatus_t cuannIvfPqSetSearchParameters( + cuannIvfPqDescriptor_t desc, + const uint32_t numProbes, /* Number of clusters to probe */ + const uint32_t topK); /* Number of search results */ + +cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t internalDistanceDtype, + cudaDataType_t smemLutDtype, + const uint32_t preferredThreadBlockSize); + +cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numProbes, + uint32_t* topK); + +cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t* internalDistanceDtype, + cudaDataType_t* smemLutDtype, + uint32_t* preferredThreadBlockSize); + +cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + uint32_t numQueries, + size_t maxWorkspaceSize, + size_t* workspaceSize); + +cuannStatus_t cuannIvfPqSearch(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const void* queries, /* [numQueries, dimDataset] */ + cudaDataType_t dtype, + uint32_t numQueries, + uint64_t* neighbors, /* [numQueries, topK] */ + float* distances, /* [numQueries, topK] */ + void* workspace); + +cuannStatus_t cuannPostprocessingRefine(uint32_t numDataset, + uint32_t numQueries, + uint32_t dimDataset, + const void* dataset, /* [numDataset, dimDataset] */ + const void* queries, /* [numQueries, dimDataset] */ + cudaDataType_t dtype, + cuannSimilarity_t similarity, + uint32_t topK, + const uint64_t* neighbors, /* [numQueries, topK] */ + uint32_t refinedTopK, + uint64_t* refinedNeighbors, /* [numQueries, refinedTopK] */ + float* refinedDistances /* [numQueries, refinedTopK] */ +); + +cuannStatus_t cuannPostprocessingMerge( + uint32_t numSplit, + uint32_t numQueries, + uint32_t topK, + const uint32_t* eachNumDataset, /* [numSplit] */ + const uint64_t* eachNeighbors, /* [numSplit, numQueries, topK] */ + const float* eachDistances, /* [numSplit, numQueries, topK] */ + uint64_t* neighbors, /* [numQueries, topK] */ + float* distances /* [numQueries, topK] */ +); + +size_t _cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t desc) +{ + // [numClusters, dimDatasetExt] + return _cuann_aligned(sizeof(float) * desc->numClusters * 
desc->dimDatasetExt); +} + +size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) +{ + size_t size_base = sizeof(float) * (1 << desc->bitPq) * desc->lenPq; + if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + // [dimPq, 1 << bitPq, lenPq] + return _cuann_aligned(desc->dimPq * size_base); + } else { + // [numClusters, 1 << bitPq, lenPq] + return _cuann_aligned(desc->numClusters * size_base); + } +} + +size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t desc) +{ + // [numDataset, dimPq * bitPq / 8] + return _cuann_aligned(sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8); +} + +size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t desc) +{ + // [numDataset,] + return _cuann_aligned(sizeof(uint32_t) * desc->numDataset); +} + +size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t desc) +{ + // [numClusters + 1,] + return _cuann_aligned(sizeof(uint32_t) * (desc->numClusters + 1)); +} + +size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t desc) +{ + // [dimDataset, dimRotDataset] + return _cuann_aligned(sizeof(float) * desc->dimDataset * desc->dimRotDataset); +} + +size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t desc) +{ + // [numClusters, dimRotDataset] + return _cuann_aligned(sizeof(float) * desc->numClusters * desc->dimRotDataset); +} + +void _cuann_get_index_pointers(cuannIvfPqDescriptor_t desc, + const void* index, + struct cuannIvfPqIndexHeader** header, + float** clusterCenters, // [numClusters, dimDatasetExt] + float** pqCenters, // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t** pqDataset, // [numDataset, dimPq * bitPq / 8] + uint32_t** originalNumbers, // [numDataset] + uint32_t** indexPtr, // [numClusters + 1] + float** rotationMatrix, // [dimDataset, dimRotDataset] + float** clusterRotCenters // [numClusters, dimRotDataset] +) +{ + *header = (struct cuannIvfPqIndexHeader*)index; + *clusterCenters = (float*)((uint8_t*)(*header) + sizeof(struct cuannIvfPqIndexHeader)); + *pqCenters = (float*)((uint8_t*)(*clusterCenters) + _cuann_getIndexSize_clusterCenters(desc)); + *pqDataset = (uint8_t*)((uint8_t*)(*pqCenters) + _cuann_getIndexSize_pqCenters(desc)); + *originalNumbers = (uint32_t*)((uint8_t*)(*pqDataset) + _cuann_getIndexSize_pqDataset(desc)); + *indexPtr = (uint32_t*)((uint8_t*)(*originalNumbers) + _cuann_getIndexSize_originalNumbers(desc)); + *rotationMatrix = (float*)((uint8_t*)(*indexPtr) + _cuann_getIndexSize_indexPtr(desc)); + *clusterRotCenters = + (float*)((uint8_t*)(*rotationMatrix) + _cuann_getIndexSize_rotationMatrix(desc)); +} + +__global__ void kern_get_cluster_size(uint32_t numClusters, + const uint32_t* indexPtr, // [numClusters + 1,] + uint32_t* clusterSize // [numClusters,] +) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= numClusters) return; + clusterSize[i] = indexPtr[i + 1] - indexPtr[i]; +} + +template +int descending(const void* a, const void* b) +{ + T valA = ((T*)a)[0]; + T valB = ((T*)b)[0]; + if (valA > valB) return -1; + if (valA < valB) return 1; + return 0; +} + +// (*) This is temporal. Need to be removed in future. 
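+// Fill 'vector' with a random unit-norm vector. Used as a stand-in center for
+// clusters that ended up empty, so that later distance computations stay
+// well-defined.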
+void _cuann_get_random_norm_vector(int len, float* vector) +{ + float sqsum = 0.0; + for (int i = 0; i < len; i++) { + vector[i] = ((float)rand() / RAND_MAX) * 2.0 - 1.0; + sqsum += vector[i] * vector[i]; + } + float norm = sqrt(sqsum); + for (int i = 0; i < len; i++) { + vector[i] /= norm; + } +} + +void _cuann_get_inclusiveSumSortedClusterSize( + cuannIvfPqDescriptor_t desc, + const uint32_t* indexPtr, // [numClusters + 1] + float* clusterCenters, // [numClusters, dimDatasetExt] + uint32_t** output // [numClusters] +) +{ + // [CPU] + *output = (uint32_t*)malloc(sizeof(uint32_t) * desc->numClusters); + desc->_numClustersSize0 = 0; + for (int i = 0; i < desc->numClusters; i++) { + (*output)[i] = indexPtr[i + 1] - indexPtr[i]; + if ((*output)[i] > 0) continue; + + desc->_numClustersSize0 += 1; + // Work-around for clusters of size 0 +#if 0 + printf("# i:%d, %u ... ", i, (*output)[i]); + for (int j = 0; j < desc->dimDatasetExt; j++) { + printf( "%.3f, ", clusterCenters[ j + (desc->dimDatasetExt * i) ] ); + } + printf( "\n" ); +#endif + _cuann_get_random_norm_vector(desc->dimDatasetExt, clusterCenters + (desc->dimDatasetExt * i)); +#if 0 + printf("# i:%d, %u ... ", i, (*output)[i]); + for (int j = 0; j < desc->dimDatasetExt; j++) { + printf( "%.3f, ", clusterCenters[ j + (desc->dimDatasetExt * i) ] ); + } + printf( "\n" ); +#endif + } + if (1 || desc->_numClustersSize0 > 0) { + fprintf(stderr, "# num clusters of size 0: %d\n", desc->_numClustersSize0); + } + // sort + qsort(*output, desc->numClusters, sizeof(uint32_t), descending); + // scan + for (int i = 1; i < desc->numClusters; i++) { + (*output)[i] += (*output)[i - 1]; + } + assert((*output)[desc->numClusters - 1] == desc->numDataset); +} + +void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, + const float* clusterCenters, // [numClusters, dimDataset,] + float** output // [numClusters,] +) +{ + cudaError_t cudaError; + if (*output != NULL) { cudaFree(*output); } + cudaError = cudaMallocManaged(output, sizeof(float) * desc->numClusters); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + exit(-1); + } + _cuann_sqsum(desc->numClusters, desc->dimDataset, clusterCenters, *output); +} + +// +template +T _cuann_dot(int n, const T* x, int incX, const T* y, int incY) +{ + T val = 0; + for (int i = 0; i < n; i++) { + val += x[incX * i] * y[incY * i]; + } + return val; +} + +// +template +T _cuann_dot(int n, const X* x, int incX, const Y* y, int incY, T divisor = 1) +{ + T val = 0; + for (int i = 0; i < n; i++) { + val += (T)(x[incX * i]) * (T)(y[incY * i]) / divisor; + } + return val; +} + +// +template +T _cuann_rand() +{ + return (T)rand() / RAND_MAX; +} + +// make rotation matrix +void _cuann_make_rotation_matrix(uint32_t nRows, + uint32_t nCols, + uint32_t lenPq, + bool randomRotation, + float* rotationMatrix // [nRows, nCols] +) +{ + assert(nRows >= nCols); + assert(nRows % lenPq == 0); + + if (randomRotation) { + fprintf(stderr, "# create rotation matrix randomly.\n"); + double dot, norm; + double* matrix = (double*)malloc(sizeof(double) * nRows * nCols); + memset(matrix, 0, sizeof(double) * nRows * nCols); + for (int i = 0; i < nRows * nCols; i++) { + matrix[i] = _cuann_rand() - 0.5; + } + for (int j = 0; j < nCols; j++) { + // normalize the j-th col vector + norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix + j, nCols)); + for (int i = 0; i < nRows; i++) { + matrix[j + (nCols * i)] /= norm; + } + // orthogonalize the j-th col vector with the previous col 
vectors + for (int k = 0; k < j; k++) { + dot = _cuann_dot(nRows, matrix + j, nCols, matrix + k, nCols); + for (int i = 0; i < nRows; i++) { + matrix[j + (nCols * i)] -= dot * matrix[k + (nCols * i)]; + } + } + // normalize the j-th col vector again + norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix + j, nCols)); + for (int i = 0; i < nRows; i++) { + matrix[j + (nCols * i)] /= norm; + } + } + for (int i = 0; i < nRows * nCols; i++) { + rotationMatrix[i] = (float)matrix[i]; + } + free(matrix); + } else { + if (nRows == nCols) { + memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); + for (int i = 0; i < nCols; i++) { + rotationMatrix[i + (nCols * i)] = 1.0; + } + } else { + memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); + int i = 0; + for (int j = 0; j < nCols; j++) { + rotationMatrix[j + (nCols * i)] = 1.0; + i += lenPq; + if (i >= nRows) { i = (i % nRows) + 1; } + } + } + } +} + +// show centers (for debuging) +void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const uint32_t* centerSize, + const int numShow = 5) +{ + for (uint64_t k = 0; k < numCenters; k++) { + if ((numShow <= k) && (k < numCenters - numShow)) { + if (k == numShow) fprintf(stderr, "...\n"); + continue; + } + fprintf(stderr, "# centers[%lu]:", k); + for (uint64_t j = 0; j < dimCenters; j++) { + if ((numShow <= j) && (j < dimCenters - numShow)) { + if (j == numShow) fprintf(stderr, " ... "); + continue; + } + fprintf(stderr, " %f,", centers[j + (dimCenters * k)]); + } + fprintf(stderr, " %d\n", centerSize[k]); + } +} + +// show dataset (for debugging) +void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] + uint32_t numDataset, + uint32_t dimDataset, + const int numShow = 5) +{ + for (uint64_t i = 0; i < numDataset; i++) { + if ((numShow <= i) && (i < numDataset - numShow)) { + if (i == numShow) fprintf(stderr, "...\n"); + continue; + } + fprintf(stderr, "# dataset[%lu]:", i); + for (uint64_t j = 0; j < dimDataset; j++) { + if ((numShow <= j) && (j < dimDataset - numShow)) { + if (j == numShow) fprintf(stderr, " ... "); + continue; + } + fprintf(stderr, " %.3f,", dataset[j + (dimDataset * i)]); + } + fprintf(stderr, "\n"); + } +} + +// show pq code (for debuging) +void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq] + uint32_t numDataset, + uint32_t dimPq, + const int numShow = 5) +{ + for (uint64_t i = 0; i < numDataset; i++) { + if ((numShow <= i) && (i < numDataset - numShow)) { + if (i == numShow) fprintf(stderr, "...\n"); + continue; + } + fprintf(stderr, "# dataset[%lu]:", i); + for (uint64_t j = 0; j < dimPq; j++) { + if ((numShow <= j) && (j < dimPq - numShow)) { + if (j == numShow) fprintf(stderr, " ... 
"); + continue; + } + fprintf(stderr, " %u,", pqDataset[j + (dimPq * i)]); + } + fprintf(stderr, "\n"); + } +} + +// +int _cuann_set_device(int devId) +{ + int orgDevId; + cudaError_t cudaError = cudaGetDevice(&orgDevId); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaGetDevice() failed (%d)\n", __func__, __LINE__, cudaError); + exit(-1); + } + cudaError = cudaSetDevice(devId); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaSetDevice() failed (%d)\n", __func__, __LINE__, cudaError); + exit(-1); + } + return orgDevId; +} + +// +uint32_t _get_num_trainset(uint32_t clusterSize, uint32_t dimPq, uint32_t bitPq) +{ + return min(clusterSize * dimPq, 256 * max(1 << bitPq, dimPq)); +} + +// +void _cuann_compute_PQ_code(cuannHandle_t handle, + uint32_t numDataset, + uint32_t dimDataset, + uint32_t dimRotDataset, + uint32_t dimPq, + uint32_t lenPq, + uint32_t bitPq, + uint32_t numClusters, + cudaDataType_t dtype, + cuannPqCenter_t typePqCenter, + uint32_t maxClusterSize, + float* clusterCenters, // [numClusters, dimDataset] + const float* rotationMatrix, // [dimRotDataset, dimDataset] + const void* dataset, // [numDataset] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterSize, // [numClusters] + const uint32_t* indexPtr, // [numClusters + 1] + float* pqCenters, // [...] + uint32_t numIterations, + uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] +) +{ + // + // Compute PQ code + // + memset(pqDataset, 0, sizeof(uint8_t) * numDataset * dimPq * bitPq / 8); + float** resVectors; // [numDevices][maxClusterSize, dimDataset] + float** rotVectors; // [numDevices][maxClusterSize, dimRotDataset] + float** subVectors; // [numDevices][dimPq, maxClusterSize, lenPq] + uint32_t** subVectorLabels; // [numDevices][dimPq, maxClusterSize] + uint8_t** myPqDataset; // [numDevices][maxCluserSize, dimPq * bitPq / 8] + resVectors = _cuann_multi_device_malloc( + handle->numDevices, maxClusterSize * dimDataset, "resVectors"); + rotVectors = _cuann_multi_device_malloc( + handle->numDevices, maxClusterSize * dimRotDataset, "rotVectors"); + subVectors = _cuann_multi_device_malloc( + handle->numDevices, dimPq * maxClusterSize * lenPq, "subVectors"); + subVectorLabels = _cuann_multi_device_malloc( + handle->numDevices, dimPq * maxClusterSize, "subVectorLabels"); + myPqDataset = _cuann_multi_device_malloc( + handle->numDevices, maxClusterSize * dimPq * bitPq / 8, "myPqDataset"); + + uint32_t maxTrainset = 0; + if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { + maxTrainset = _get_num_trainset(maxClusterSize, dimPq, bitPq); + } + void** pqPredictWorkspace = (void**)_cuann_multi_device_malloc( + handle->numDevices, + _cuann_kmeans_predict_bufferSize((1 << bitPq), lenPq, max(maxClusterSize, maxTrainset)), + "pqPredictWorkspace"); + + uint32_t** rotVectorLabels; // [numDevices][maxClusterSize, dimPq,] + uint32_t** pqClusterSize; // [numDevices][1 << bitPq,] + uint32_t** wsKAC; // [numDevices][1] + float** myPqCenters; // [numDevices][1 << bitPq, lenPq] + float** myPqCentersTemp; // [numDevices][1 << bitPq, lenPq] + if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { + memset(pqCenters, 0, sizeof(float) * numClusters * (1 << bitPq) * lenPq); + rotVectorLabels = _cuann_multi_device_malloc( + handle->numDevices, maxClusterSize * dimPq, "rotVectorLabels"); + pqClusterSize = + _cuann_multi_device_malloc(handle->numDevices, (1 << bitPq), "pqClusterSize"); + wsKAC = _cuann_multi_device_malloc(handle->numDevices, 1, 
"wsKAC"); + myPqCenters = + _cuann_multi_device_malloc(handle->numDevices, (1 << bitPq) * lenPq, "myPqCenters"); + myPqCentersTemp = _cuann_multi_device_malloc( + handle->numDevices, (1 << bitPq) * lenPq, "myPqCentersTemp"); + } + +#pragma omp parallel for schedule(dynamic) num_threads(handle->numDevices) + for (uint32_t l = 0; l < numClusters; l++) { + int devId = omp_get_thread_num(); + cudaSetDevice(devId); + if (devId == 0) { + fprintf(stderr, "(%s) Making PQ dataset: %u / %u \r", __func__, l, numClusters); + } + if (clusterSize[l] == 0) continue; + + // + // Compute the residual vector of the new vector with its cluster + // centroids. + // resVectors[..] = newVectors[..] - clusterCenters[..] + // + if (dtype == CUDA_R_32F) { + _cuann_copy_with_list(clusterSize[l], + dimDataset, + (float*)dataset, + originalNumbers + indexPtr[l], + dimDataset, + resVectors[devId], + dimDataset); + } else if (dtype == CUDA_R_8U) { + const float divisor = 256.0; + _cuann_copy_with_list(clusterSize[l], + dimDataset, + (uint8_t*)dataset, + originalNumbers + indexPtr[l], + dimDataset, + resVectors[devId], + dimDataset, + divisor); + } else if (dtype == CUDA_R_8I) { + const float divisor = 128.0; + _cuann_copy_with_list(clusterSize[l], + dimDataset, + (int8_t*)dataset, + originalNumbers + indexPtr[l], + dimDataset, + resVectors[devId], + dimDataset, + divisor); + } + _cuann_a_me_b(clusterSize[l], + dimDataset, + resVectors[devId], + dimDataset, + clusterCenters + (uint64_t)l * dimDataset); + + // + // Rotate the residual vectors using a rotation matrix + // + cudaStream_t cublasStream = _cuann_set_cublas_stream(handle->cublasHandles[devId], NULL); + float alpha = 1.0; + float beta = 0.0; + cublasStatus_t cublasError = cublasGemmEx(handle->cublasHandles[devId], + CUBLAS_OP_T, + CUBLAS_OP_N, + dimRotDataset, + clusterSize[l], + dimDataset, + &alpha, + rotationMatrix, + CUDA_R_32F, + dimDataset, + resVectors[devId], + CUDA_R_32F, + dimDataset, + &beta, + rotVectors[devId], + CUDA_R_32F, + dimRotDataset, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); + // return CUANN_STATUS_CUBLAS_ERROR; + exit(-1); + } + _cuann_set_cublas_stream(handle->cublasHandles[devId], cublasStream); + + // + // Training PQ codebook if CUANN_PQ_CENTER_PER_CLUSTER + // (*) PQ codebooks are trained for each cluster. 
+ // + if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { + uint32_t numTrainset = _get_num_trainset(clusterSize[l], dimPq, bitPq); + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + if (devId == 0) { + fprintf(stderr, + "(%s) Making PQ dataset: %u / %u, " + "Training PQ codebook (%u): %.1f / %u \r", + __func__, + l, + numClusters, + numTrainset, + (float)iter / 2, + numIterations); + } + _cuann_kmeans_predict(handle->cublasHandles[devId], + myPqCenters[devId], + (1 << bitPq), + lenPq, + rotVectors[devId], + CUDA_R_32F, + numTrainset, + rotVectorLabels[devId], + CUANN_SIMILARITY_L2, + (iter != 0), + pqPredictWorkspace[devId], + myPqCentersTemp[devId], + pqClusterSize[devId], + true); + if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(myPqCenters[devId], + (1 << bitPq), + lenPq, + rotVectors[devId], + CUDA_R_32F, + numTrainset, + rotVectorLabels[devId], + CUANN_SIMILARITY_L2, + pqClusterSize[devId], + (float)1.0 / 4, + wsKAC[devId])) { + iter -= 1; + } + } + cudaMemcpy(pqCenters + ((1 << bitPq) * lenPq) * l, + myPqCenters[devId], + sizeof(float) * (1 << bitPq) * lenPq, + cudaMemcpyDeviceToHost); + } + + // + // Change the order of the vector data to facilitate processing in + // each vector subspace. + // input: rotVectors[clusterSize, dimRotDataset] + // output: subVectors[dimPq, clusterSize, lenPq] + // + _cuann_transpose_copy_3d(lenPq, + clusterSize[l], + dimPq, + subVectors[devId], + lenPq, + clusterSize[l], + rotVectors[devId], + 1, + dimRotDataset, + lenPq); + + // + // Find a label (cluster ID) for each vector subspace. + // + for (uint32_t j = 0; j < dimPq; j++) { + float* curPqCenters = NULL; + if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + curPqCenters = pqCenters + ((1 << bitPq) * lenPq) * j; + } else if (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER) { + curPqCenters = pqCenters + ((1 << bitPq) * lenPq) * l; + if (numIterations > 0) { curPqCenters = myPqCenters[devId]; } + } + _cuann_kmeans_predict(handle->cublasHandles[devId], + curPqCenters, + (1 << bitPq), + lenPq, + subVectors[devId] + j * (clusterSize[l] * lenPq), + CUDA_R_32F, + clusterSize[l], + subVectorLabels[devId] + j * clusterSize[l], + CUANN_SIMILARITY_L2, + true, + pqPredictWorkspace[devId], + nullptr, + nullptr, + true); + } + + // + // PQ encoding + // + ivfpq_encode( + clusterSize[l], clusterSize[l], dimPq, bitPq, subVectorLabels[devId], myPqDataset[devId]); + cudaMemcpy(pqDataset + ((uint64_t)indexPtr[l] * dimPq * bitPq / 8), + myPqDataset[devId], + sizeof(uint8_t) * clusterSize[l] * dimPq * bitPq / 8, + cudaMemcpyDeviceToHost); + // cudaDeviceSynchronize(); + } + cudaDeviceSynchronize(); + fprintf(stderr, "\n"); + + // + _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, handle->numDevices); + _cuann_multi_device_free(myPqDataset, handle->numDevices); + _cuann_multi_device_free(subVectorLabels, handle->numDevices); + _cuann_multi_device_free(subVectors, handle->numDevices); + _cuann_multi_device_free(rotVectors, handle->numDevices); + _cuann_multi_device_free(resVectors, handle->numDevices); + if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { + _cuann_multi_device_free(wsKAC, handle->numDevices); + _cuann_multi_device_free(rotVectorLabels, handle->numDevices); + _cuann_multi_device_free(pqClusterSize, handle->numDevices); + _cuann_multi_device_free(myPqCenters, handle->numDevices); + _cuann_multi_device_free(myPqCentersTemp, handle->numDevices); + } +} + +// cuannCreate 
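+// Allocate the library context: for every visible GPU, store a (NULL) stream,
+// the device properties and a cuBLAS handle, then restore the caller's current
+// device via cuannSetDevice() before returning.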
+cuannStatus_t cuannCreate(cuannHandle_t* handle) +{ + cudaError_t cudaError; + cublasStatus_t cublasError; + + *handle = (cuannHandle_t)malloc(sizeof(struct cuannContext)); + if (*handle == NULL) { return CUANN_STATUS_ALLOC_FAILED; } + + // Keep the current device ID. + int devId; + cudaError = cudaGetDevice(&devId); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaGetDevice() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUDA_ERROR; + } + + // numDevices + cudaGetDeviceCount(&((*handle)->numDevices)); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaGetDeviceCount() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUDA_ERROR; + } + + (*handle)->streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * (*handle)->numDevices); + (*handle)->deviceProps = (cudaDeviceProp*)malloc(sizeof(cudaDeviceProp) * (*handle)->numDevices); + (*handle)->cublasHandles = + (cublasHandle_t*)malloc(sizeof(cublasHandle_t) * (*handle)->numDevices); + + for (int i = 0; i < (*handle)->numDevices; i++) { + cudaError = cudaSetDevice(i); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaSetDevice() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUDA_ERROR; + } + + // stream + (*handle)->streams[i] = NULL; + + // deviceProp + cudaError = cudaGetDeviceProperties(&((*handle)->deviceProps[i]), i); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaGetDeviceProperties() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUDA_ERROR; + } + + // cublasHandle + cublasError = cublasCreate(&((*handle)->cublasHandles[i])); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasCreate() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUBLAS_ERROR; + } + } + + return cuannSetDevice(*handle, devId); +} + +// cuannDestroy +cuannStatus_t cuannDestroy(cuannHandle_t handle) +{ + if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + cublasStatus_t cublasError; + for (int i = 0; i < handle->numDevices; i++) { + cublasError = cublasDestroy(handle->cublasHandles[i]); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasDestroy() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUBLAS_ERROR; + } + } + free(handle->streams); + free(handle->deviceProps); + free(handle->cublasHandles); + free(handle); + return CUANN_STATUS_SUCCESS; +} + +// cuannSetStream +cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream) +{ + if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int devId = handle->devId; + cublasSetStream(handle->cublasHandles[devId], stream); + handle->streams[devId] = stream; + + return cuannSetDevice(handle, devId); +} + +// cuannSetDevice +cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId) +{ + if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + if (devId < 0 || devId >= handle->numDevices) { + fprintf( + stderr, "(%s, %d) devId is out of range (devId:%d) failed.\n", __func__, __LINE__, devId); + return CUANN_STATUS_INVALID_VALUE; + } + + // (*) Need to re-consider whether it is good to call cudaSetDevice() here. 
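+  // Besides switching the CUDA device, the per-device stream, device properties
+  // and cuBLAS handle are swapped into the handle below so that subsequent calls
+  // use the selected device's resources.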
+ cudaError_t cudaError = cudaSetDevice(devId); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaSetDevice() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUDA_ERROR; + } + + handle->devId = devId; + handle->stream = handle->streams[devId]; + handle->deviceProp = handle->deviceProps[devId]; + handle->cublasHandle = handle->cublasHandles[devId]; + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqCreateDescriptor +cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) +{ + *desc = (cuannIvfPqDescriptor_t)malloc(sizeof(struct cuannIvfPqDescriptor)); + if (*desc == NULL) { return CUANN_STATUS_ALLOC_FAILED; } + (*desc)->numClusters = 0; + (*desc)->numDataset = 0; + (*desc)->dimDataset = 0; + (*desc)->dimDatasetExt = 0; + (*desc)->dimRotDataset = 0; + (*desc)->dimPq = 0; + (*desc)->bitPq = 0; + (*desc)->numProbes = 0; + (*desc)->topK = 0; + (*desc)->maxQueries = 0; + (*desc)->maxBatchSize = 0; + (*desc)->maxSamples = 0; + (*desc)->inclusiveSumSortedClusterSize = NULL; + (*desc)->sqsumClusters = NULL; + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqDestroyDescriptor +cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + if (desc->sqsumClusters != NULL) { cudaFree(desc->sqsumClusters); } + free(desc); + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqSetIndexParameters +cuannStatus_t cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, + const uint32_t numClusters, + const uint32_t numDataset, + const uint32_t dimDataset, + const uint32_t dimPq, + const uint32_t bitPq, + const cuannSimilarity_t similarity, + const cuannPqCenter_t typePqCenter) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + if (numClusters == 0) { + fprintf( + stderr, "(%s) numClusters must be larger than zero (dimDataset:%u).\n", __func__, dimDataset); + return CUANN_STATUS_INVALID_VALUE; + } + if (numDataset == 0) { + fprintf( + stderr, "(%s) numDataset must be larger than zero (numDataset:%u).\n", __func__, numDataset); + return CUANN_STATUS_INVALID_VALUE; + } + if (dimDataset == 0) { + fprintf( + stderr, "(%s) dimDataset must be larger than zero (dimDataset:%u).\n", __func__, dimDataset); + return CUANN_STATUS_INVALID_VALUE; + } + if (dimPq == 0) { + fprintf(stderr, "(%s) dimPq must be larger than zero (dimPq:%u).\n", __func__, dimPq); + return CUANN_STATUS_INVALID_VALUE; + } + if (numClusters > numDataset) { + fprintf(stderr, + "(%s) numClusters must be smaller than numDataset (numClusters:%u, numDataset:%u).\n", + __func__, + numClusters, + numDataset); + return CUANN_STATUS_INVALID_VALUE; + } + if (bitPq < 4 || bitPq > 8) { + fprintf(stderr, "(%s) bitPq must be 4, 5, 6, 7 or 8 (bitPq:%u)\n", __func__, bitPq); + return CUANN_STATUS_INVALID_VALUE; + } + if (bitPq == 4 && dimPq % 2 != 0) { + fprintf(stderr, + "(%s) dimPq must be multiple of 2 when bitPq is 4 (dimPq:%u, bitPq:%u)\n", + __func__, + dimPq, + bitPq); + return CUANN_STATUS_INVALID_VALUE; + } + if (bitPq == 5 && dimPq % 8 != 0) { + fprintf(stderr, + "(%s) dimPq must be multiple of 8 when bitPq is 5 (dimPq:%u, bitPq:%u)\n", + __func__, + dimPq, + bitPq); + return CUANN_STATUS_INVALID_VALUE; + } + if (bitPq == 6 && dimPq % 4 != 0) { + fprintf(stderr, + "(%s) dimPq must be multiple of 4 when bitPq is 6 (dimPq:%u, bitPq:%u)\n", + __func__, + dimPq, + bitPq); + return CUANN_STATUS_INVALID_VALUE; + } + if (bitPq == 7 && dimPq % 8 != 0) { + fprintf(stderr, + "(%s) dimPq must be multiple of 8 when bitPq is 7 (dimPq:%u, 
bitPq:%u)\n", + __func__, + dimPq, + bitPq); + return CUANN_STATUS_INVALID_VALUE; + } + desc->numClusters = numClusters; + desc->numDataset = numDataset; + desc->dimDataset = dimDataset; + desc->dimDatasetExt = dimDataset + 1; + if (desc->dimDatasetExt % 8) { desc->dimDatasetExt += 8 - (desc->dimDatasetExt % 8); } + assert(desc->dimDatasetExt >= dimDataset + 1); + assert(desc->dimDatasetExt % 8 == 0); + desc->dimPq = dimPq; + desc->bitPq = bitPq; + desc->similarity = similarity; + desc->typePqCenter = typePqCenter; + + desc->dimRotDataset = dimDataset; + if (dimDataset % dimPq) { desc->dimRotDataset = ((dimDataset / dimPq) + 1) * dimPq; } + desc->lenPq = desc->dimRotDataset / dimPq; + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqGetIndexParameters +cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + uint32_t* bitPq, + cuannSimilarity_t* similarity, + cuannPqCenter_t* typePqCenter) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + + *numClusters = desc->numClusters; + *numDataset = desc->numDataset; + *dimDataset = desc->dimDataset; + *dimPq = desc->dimPq; + *bitPq = desc->bitPq; + *similarity = desc->similarity; + *typePqCenter = desc->typePqCenter; + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqGetIndexSize +cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + + *size = sizeof(struct cuannIvfPqIndexHeader); + if (*size != 1024) { + fprintf(stderr, "(%s, %d) Unexpected Error!\n", __func__, __LINE__); + exit(-1); + } + *size += _cuann_getIndexSize_clusterCenters(desc); + *size += _cuann_getIndexSize_pqCenters(desc); + *size += _cuann_getIndexSize_pqDataset(desc); + *size += _cuann_getIndexSize_originalNumbers(desc); + *size += _cuann_getIndexSize_indexPtr(desc); + *size += _cuann_getIndexSize_rotationMatrix(desc); + *size += _cuann_getIndexSize_clusterRotCenters(desc); + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqBuildIndex +cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* dataset, + const void* trainset, + cudaDataType_t dtype, + uint32_t numTrainset, + uint32_t numIterations, + bool randomRotation, + bool hierarchicalClustering, + void* index) +{ + if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int cuannDevId = handle->devId; + int callerDevId = _cuann_set_device(cuannDevId); + + if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { + return CUANN_STATUS_UNSUPPORTED_DTYPE; + } + if (desc->similarity == CUANN_SIMILARITY_INNER && dtype != CUDA_R_32F) { + fprintf( + stderr, "(%s, %d) CUANN_SIMILARITY_INNER supports float dtype only.\n", __func__, __LINE__); + return CUANN_STATUS_UNSUPPORTED_DTYPE; + } + desc->dtypeDataset = dtype; + char dtypeString[64]; + fprintf(stderr, "# dtypeDataset: %s\n", _cuann_get_dtype_string(desc->dtypeDataset, dtypeString)); + + cudaError_t cudaError; + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, dataset); + if (attr.type == cudaMemoryTypeDevice) { + fprintf(stderr, "(%s) dataset must be accessible from the host.\n", __func__); + return CUANN_STATUS_INVALID_POINTER; + } + cudaPointerGetAttributes(&attr, trainset); + if (attr.type == cudaMemoryTypeDevice) { + fprintf(stderr, "(%s) trainset must be accessible from the host.\n", __func__); + return CUANN_STATUS_INVALID_POINTER; + } + + struct cuannIvfPqIndexHeader* 
header; + float* clusterCenters; // [numClusters, dimDataset] + float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* originalNumbers; // [numDataset] + uint32_t* indexPtr; // [numClusters + 1] + float* rotationMatrix; // [dimDataset, dimRotDataset] + float* clusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(desc, + index, + &header, + &clusterCenters, + &pqCenters, + &pqDataset, + &originalNumbers, + &indexPtr, + &rotationMatrix, + &clusterRotCenters); + + uint32_t* trainsetLabels; // [numTrainset] + cudaError = cudaMallocManaged(&trainsetLabels, sizeof(uint32_t) * numTrainset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + + uint32_t* clusterSize; // [numClusters] + cudaError = cudaMallocManaged(&clusterSize, sizeof(uint32_t) * desc->numClusters); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + + float* clusterCentersTemp; // [numClusters, dimDataset] + cudaError = + cudaMallocManaged(&clusterCentersTemp, sizeof(float) * desc->numClusters * desc->dimDataset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + + uint32_t** wsKAC = _cuann_multi_device_malloc(handle->numDevices, 1, "wsKAC"); + + // + // Training kmeans + // + fprintf(stderr, "# hierarchicalClustering: %u\n", hierarchicalClustering); + if (hierarchicalClustering) { + // Hierarchical kmeans + uint32_t numMesoClusters = pow((double)(desc->numClusters), (double)1.0 / 2.0) + 0.5; + fprintf(stderr, "# numMesoClusters: %u\n", numMesoClusters); + + float* mesoClusterCenters; // [numMesoClusters, dimDataset] + cudaError = + cudaMallocManaged(&mesoClusterCenters, sizeof(float) * numMesoClusters * desc->dimDataset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + float* mesoClusterCentersTemp; // [numMesoClusters, dimDataset] + cudaError = cudaMallocManaged(&mesoClusterCentersTemp, + sizeof(float) * numMesoClusters * desc->dimDataset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + + uint32_t* mesoClusterLabels; // [numTrainset,] + cudaError = cudaMallocManaged(&mesoClusterLabels, sizeof(uint32_t) * numTrainset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + + uint32_t* mesoClusterSize; // [numMesoClusters,] + cudaError = cudaMallocManaged(&mesoClusterSize, sizeof(uint32_t) * numMesoClusters); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + + // + // Training kmeans for meso-clusters + // + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + fprintf(stderr, + "(%s) " + "Training kmeans for meso-clusters: %.1f / %u \r", + __func__, + (float)iter / 2, + numIterations); + _cuann_kmeans_predict(handle->cublasHandle, + mesoClusterCenters, + numMesoClusters, + desc->dimDataset, + trainset, + dtype, + 
numTrainset, + mesoClusterLabels, + desc->similarity, + (iter != 0), + NULL, + mesoClusterCentersTemp, + mesoClusterSize, + true); + if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(mesoClusterCenters, + numMesoClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + mesoClusterLabels, + desc->similarity, + mesoClusterSize, + (float)1.0 / 4, + nullptr)) { + iter -= 1; + } + } + fprintf(stderr, "\n"); + cudaDeviceSynchronize(); + + // Number of centers in each meso cluster + // [numMesoClusters,] + uint32_t* numFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * numMesoClusters); + + // [numMesoClusters + 1,] + uint32_t* csumFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * (numMesoClusters + 1)); + csumFineClusters[0] = 0; + + uint32_t numClustersRemain = desc->numClusters; + uint32_t numTrainsetRemain = numTrainset; + uint32_t mesoClusterSizeSum = 0; // check + uint32_t mesoClusterSizeMax = 0; + uint32_t numFineClustersMax = 0; + for (uint32_t i = 0; i < numMesoClusters; i++) { + if (i < numMesoClusters - 1) { + numFineClusters[i] = + (double)numClustersRemain * mesoClusterSize[i] / numTrainsetRemain + .5; + } else { + numFineClusters[i] = numClustersRemain; + } + csumFineClusters[i + 1] = csumFineClusters[i] + numFineClusters[i]; + + numClustersRemain -= numFineClusters[i]; + numTrainsetRemain -= mesoClusterSize[i]; + mesoClusterSizeSum += mesoClusterSize[i]; + mesoClusterSizeMax = max(mesoClusterSizeMax, mesoClusterSize[i]); + numFineClustersMax = max(numFineClustersMax, numFineClusters[i]); + } + assert(mesoClusterSizeSum == numTrainset); + assert(csumFineClusters[numMesoClusters] == desc->numClusters); + + uint32_t** idsTrainset = + _cuann_multi_device_malloc(handle->numDevices, mesoClusterSizeMax, "idsTrainset"); + + float** subTrainset = _cuann_multi_device_malloc( + handle->numDevices, mesoClusterSizeMax * desc->dimDataset, "subTrainset"); + + // label (cluster ID) of each vector + uint32_t** labelsMP = + _cuann_multi_device_malloc(handle->numDevices, mesoClusterSizeMax, "labelsMP"); + + float** clusterCentersEach = _cuann_multi_device_malloc( + handle->numDevices, numFineClustersMax * desc->dimDataset, "clusterCentersEach"); + + float** clusterCentersMP = _cuann_multi_device_malloc( + handle->numDevices, numFineClustersMax * desc->dimDataset, "clusterCentersMP"); + + // number of vectors in each cluster + uint32_t** clusterSizeMP = + _cuann_multi_device_malloc(handle->numDevices, numFineClustersMax, "clusterSizeMP"); + + size_t sizePredictWorkspace = 0; + for (uint32_t i = 0; i < numMesoClusters; i++) { + sizePredictWorkspace = + max(sizePredictWorkspace, + _cuann_kmeans_predict_bufferSize(numFineClusters[i], // number of centers + desc->dimDataset, + mesoClusterSize[i] // number of vectors + )); + } + void** predictWorkspace = (void**)_cuann_multi_device_malloc( + handle->numDevices, sizePredictWorkspace, "predictWorkspace"); + + // + // Training kmeans for clusters in each meso-clusters + // +#pragma omp parallel for schedule(dynamic) num_threads(handle->numDevices) + for (uint32_t i = 0; i < numMesoClusters; i++) { + int devId = omp_get_thread_num(); + cudaSetDevice(devId); + + uint32_t k = 0; + for (uint32_t j = 0; j < numTrainset; j++) { + if (mesoClusterLabels[j] != i) continue; + idsTrainset[devId][k++] = j; + } + assert(k == mesoClusterSize[i]); + + if (dtype == CUDA_R_32F) { + _cuann_copy_with_list(mesoClusterSize[i], + desc->dimDataset, + (const float*)trainset, + (const uint32_t*)(idsTrainset[devId]), + desc->dimDataset, + 
subTrainset[devId], + desc->dimDataset); + } else if (dtype == CUDA_R_8U) { + float divisor = 256.0; + _cuann_copy_with_list(mesoClusterSize[i], + desc->dimDataset, + (const uint8_t*)trainset, + (const uint32_t*)(idsTrainset[devId]), + desc->dimDataset, + subTrainset[devId], + desc->dimDataset, + divisor); + } else if (dtype == CUDA_R_8I) { + float divisor = 128.0; + _cuann_copy_with_list(mesoClusterSize[i], + desc->dimDataset, + (const int8_t*)trainset, + (const uint32_t*)(idsTrainset[devId]), + desc->dimDataset, + subTrainset[devId], + desc->dimDataset, + divisor); + } + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + if (devId == 0) { + fprintf(stderr, + "(%s) Training kmeans for clusters in " + "meso-cluster %u (numClusters: %u): %.1f / %u \r", + __func__, + i, + numFineClusters[i], + (float)iter / 2, + numIterations); + } + _cuann_kmeans_predict(handle->cublasHandles[devId], + clusterCentersEach[devId], + numFineClusters[i], + desc->dimDataset, + subTrainset[devId], + CUDA_R_32F, + mesoClusterSize[i], + labelsMP[devId], + desc->similarity, + (iter != 0), + predictWorkspace[devId], + clusterCentersMP[devId], + clusterSizeMP[devId], + true); + if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(clusterCentersEach[devId], + numFineClusters[i], + desc->dimDataset, + subTrainset[devId], + CUDA_R_32F, + mesoClusterSize[i], + labelsMP[devId], + desc->similarity, + clusterSizeMP[devId], + (float)1.0 / 4, + wsKAC[devId])) { + iter -= 1; + } + } + cudaMemcpy(clusterCenters + (desc->dimDataset * csumFineClusters[i]), + clusterCentersEach[devId], + sizeof(float) * numFineClusters[i] * desc->dimDataset, + cudaMemcpyDeviceToDevice); + } + for (int devId = 0; devId < handle->numDevices; devId++) { + cudaSetDevice(devId); + cudaDeviceSynchronize(); + } + fprintf(stderr, "\n"); + cudaSetDevice(cuannDevId); + + _cuann_multi_device_free(idsTrainset, handle->numDevices); + _cuann_multi_device_free(subTrainset, handle->numDevices); + _cuann_multi_device_free(labelsMP, handle->numDevices); + _cuann_multi_device_free(clusterCentersEach, handle->numDevices); + _cuann_multi_device_free(clusterCentersMP, handle->numDevices); + _cuann_multi_device_free(clusterSizeMP, handle->numDevices); + _cuann_multi_device_free((uint8_t**)predictWorkspace, handle->numDevices); + + cudaFree(mesoClusterSize); + cudaFree(mesoClusterLabels); + cudaFree(mesoClusterCenters); + cudaFree(mesoClusterCentersTemp); + + free(numFineClusters); + free(csumFineClusters); + + // + // Fine-tuning kmeans for whole clusters (with multipel GPUs) + // + // (*) Since the likely cluster centroids have been calculated + // hierarchically already, the number of iteration for fine-tuning + // kmeans for whole clusters should be reduced. However, there + // is a possibility that the clusters could be unbalanced here, + // in which case the actual number of iterations would be increased. 
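+    //     Schematically, the loop below (like the other k-means loops in this
+    //     file) advances by a step X per balanced pass and repeats a pass when
+    //     _cuann_kmeans_adjust_centers() reports that centers were adjusted:
+    //       for (iter = 0; iter < numIterations_X; iter += X) {
+    //         predict labels and update centers;
+    //         if (centers adjusted for under-populated clusters) iter -= (X - 1);
+    //       }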
+ // + const int X = 5; + int numIterations_X = max(numIterations / 10, 2) * X; + for (int iter = 0; iter < numIterations_X; iter += X) { + fprintf(stderr, + "(%s) " + "Fine-tuning kmeans for whole clusters: %.1f / %d \r", + __func__, + (float)iter / X, + numIterations_X / X); + _cuann_kmeans_predict_MP(handle->numDevices, + handle->cublasHandles, + clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->similarity, + true, + clusterSize, + true /* to update clusterCenters */); + if ((iter + 1 < numIterations_X) && _cuann_kmeans_adjust_centers(clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->similarity, + clusterSize, + (float)1.0 / 5, + nullptr)) { + iter -= (X - 1); + } + } + fprintf(stderr, "\n"); + } else { + // Flat kmeans + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + fprintf( + stderr, "(%s) Training kmeans: %.1f / %u \r", __func__, (float)iter / 2, numIterations); + _cuann_kmeans_predict(handle->cublasHandle, + clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->similarity, + (iter != 0), + NULL, + clusterCentersTemp, + clusterSize, + true); + if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->similarity, + clusterSize, + (float)1.0 / 4, + nullptr)) { + iter -= 1; + } + } + fprintf(stderr, "\n"); + } + + uint32_t* datasetLabels; // [numDataset] + cudaError = cudaMallocManaged(&datasetLabels, sizeof(uint32_t) * desc->numDataset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + + // + // Predict labels of whole dataset (with multiple GPUs) + // + fprintf(stderr, "(%s) Final fitting\n", __func__); + _cuann_kmeans_predict_MP(handle->numDevices, + handle->cublasHandles, + clusterCenters, + desc->numClusters, + desc->dimDataset, + dataset, + dtype, + desc->numDataset, + datasetLabels, + desc->similarity, + true, + clusterSize, + true /* to update clusterCenters */); + +#ifdef CUANN_DEBUG + cudaDeviceSynchronize(); + _cuann_kmeans_show_centers(clusterCenters, desc->numClusters, desc->dimDataset, clusterSize); +#endif + + // Make rotation matrix + fprintf(stderr, "# dimDataset: %u\n", desc->dimDataset); + fprintf(stderr, "# dimRotDataset: %u\n", desc->dimRotDataset); + fprintf(stderr, "# randomRotation: %u\n", randomRotation); + _cuann_make_rotation_matrix( + desc->dimRotDataset, desc->dimDataset, desc->lenPq, randomRotation, rotationMatrix); + + // Rotate clusterCenters + cudaStream_t cublasStream = _cuann_set_cublas_stream(handle->cublasHandle, NULL); + float alpha = 1.0; + float beta = 0.0; + cublasStatus_t cublasError = cublasGemmEx(handle->cublasHandle, + CUBLAS_OP_T, + CUBLAS_OP_N, + desc->dimRotDataset, + desc->numClusters, + desc->dimDataset, + &alpha, + rotationMatrix, + CUDA_R_32F, + desc->dimDataset, + clusterCenters, + CUDA_R_32F, + desc->dimDataset, + &beta, + clusterRotCenters, + CUDA_R_32F, + desc->dimRotDataset, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUBLAS_ERROR; + } + _cuann_set_cublas_stream(handle->cublasHandle, 
cublasStream); + + // + // Make indexPtr, originalNumbers and pqDataset + // + uint32_t maxClusterSize = 0; + // indexPtr + indexPtr[0] = 0; + for (uint32_t l = 0; l < desc->numClusters; l++) { + indexPtr[l + 1] = indexPtr[l] + clusterSize[l]; + if (maxClusterSize < clusterSize[l]) { maxClusterSize = clusterSize[l]; } + } + if (indexPtr[desc->numClusters] != desc->numDataset) { + fprintf(stderr, "(%s, %d) Unexpected Error.\n", __func__, __LINE__); + return CUANN_STATUS_INTERNAL_ERROR; + } + desc->maxClusterSize = maxClusterSize; + // fprintf(stderr, "(%s) maxClusterSize: %u\n", __func__, maxClusterSize); + + // originalNumbers + for (uint32_t i = 0; i < desc->numDataset; i++) { + uint32_t l = datasetLabels[i]; + originalNumbers[indexPtr[l]] = i; + indexPtr[l] += 1; + } + + // Recover indexPtr + for (uint32_t l = 0; l < desc->numClusters; l++) { + indexPtr[l] -= clusterSize[l]; + } + + // [numDevices][1 << bitPq, lenPq] + float** pqCentersTemp = _cuann_multi_device_malloc( + handle->numDevices, (1 << desc->bitPq) * desc->lenPq, "pqCentersTemp"); + + // [numDevices][1 << bitPq,] + uint32_t** pqClusterSize = + _cuann_multi_device_malloc(handle->numDevices, (1 << desc->bitPq), "pqClusterSize"); + + // Allocate workspace for PQ codebook training + size_t sizePqPredictWorkspace = + _cuann_kmeans_predict_bufferSize((1 << desc->bitPq), desc->lenPq, numTrainset); + sizePqPredictWorkspace = max(sizePqPredictWorkspace, + _cuann_kmeans_predict_bufferSize( + (1 << desc->bitPq), desc->lenPq, maxClusterSize * desc->dimPq)); + void** pqPredictWorkspace = (void**)_cuann_multi_device_malloc( + handle->numDevices, sizePqPredictWorkspace, "pqPredictWorkspace"); + + if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + // + // Training PQ codebook (CUANN_PQ_CENTER_PER_SUBSPACE) + // (*) PQ codebooks are trained for each subspace. 
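+    // (*) Layout used below: the rotated, center-subtracted trainset is laid
+    //     out as modTrainset[dimPq, numTrainset, lenPq]; the element of
+    //     subspace j0 = j / lenPq at position j1 = j % lenPq of vector i goes
+    //     to idx = j1 + lenPq * i + lenPq * numTrainset * j0, so each subspace
+    //     is a contiguous [numTrainset, lenPq] block that can be handed to
+    //     k-means directly.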
+ // + + // Predict label of trainset again (with multiple GPUs) + fprintf(stderr, "(%s) Predict label of trainset again\n", __func__); + _cuann_kmeans_predict_MP(handle->numDevices, + handle->cublasHandles, + clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->similarity, + true, + NULL, + false /* do not update clusterCenters */); + + // [dimPq, numTrainset, lenPq] + size_t sizeModTrainset = sizeof(float) * desc->dimPq * numTrainset * desc->lenPq; + float* modTrainset = (float*)malloc(sizeModTrainset); + memset(modTrainset, 0, sizeModTrainset); + + // modTrainset[] = transpose( rotate(trainset[]) - clusterRotCenters[] ) +#pragma omp parallel for + for (uint32_t i = 0; i < numTrainset; i++) { + uint32_t l = trainsetLabels[i]; + for (uint32_t j = 0; j < desc->dimRotDataset; j++) { + float val; + if (dtype == CUDA_R_32F) { + val = + _cuann_dot(desc->dimDataset, + (float*)trainset + ((uint64_t)(desc->dimDataset) * i), + 1, + rotationMatrix + ((uint64_t)(desc->dimDataset) * j), + 1); + } else if (dtype == CUDA_R_8U) { + float divisor = 256.0; + val = _cuann_dot( + desc->dimDataset, + (uint8_t*)trainset + ((uint64_t)(desc->dimDataset) * i), + 1, + rotationMatrix + ((uint64_t)(desc->dimDataset) * j), + 1, + divisor); + } else if (dtype == CUDA_R_8I) { + float divisor = 128.0; + val = + _cuann_dot(desc->dimDataset, + (int8_t*)trainset + ((uint64_t)(desc->dimDataset) * i), + 1, + rotationMatrix + ((uint64_t)(desc->dimDataset) * j), + 1, + divisor); + } + uint32_t j0 = j / (desc->lenPq); // 0 <= j0 < dimPq + uint32_t j1 = j % (desc->lenPq); // 0 <= j1 < lenPq + uint64_t idx = + j1 + ((uint64_t)(desc->lenPq) * i) + ((uint64_t)(desc->lenPq) * numTrainset * j0); + modTrainset[idx] = val - clusterRotCenters[j + (desc->dimRotDataset * l)]; + } + } + + // [numDevices][numTrainset, lenPq] + float** subTrainset = _cuann_multi_device_malloc( + handle->numDevices, numTrainset * desc->lenPq, "subTrainset"); + + // [numDevices][numTrainset] + uint32_t** subTrainsetLabels = + _cuann_multi_device_malloc(handle->numDevices, numTrainset, "subTrainsetLabels"); + + float** pqCentersEach = _cuann_multi_device_malloc( + handle->numDevices, ((1 << desc->bitPq) * desc->lenPq), "pqCentersEach"); + +#pragma omp parallel for schedule(dynamic) num_threads(handle->numDevices) + for (uint32_t j = 0; j < desc->dimPq; j++) { + int devId = omp_get_thread_num(); + cudaSetDevice(devId); + + float* curPqCenters = pqCenters + ((1 << desc->bitPq) * desc->lenPq) * j; + cudaMemcpy(subTrainset[devId], + modTrainset + ((uint64_t)numTrainset * desc->lenPq * j), + sizeof(float) * numTrainset * desc->lenPq, + cudaMemcpyHostToDevice); + // Train kmeans for each PQ + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + if (devId == 0) { + fprintf(stderr, + "(%s) Training PQ codebook %u (out of %u): " + "%.1f / %u \r", + __func__, + j, + desc->dimPq, + (float)iter / 2, + numIterations); + } + _cuann_kmeans_predict(handle->cublasHandles[devId], + pqCentersEach[devId], + (1 << desc->bitPq), + desc->lenPq, + subTrainset[devId], + CUDA_R_32F, + numTrainset, + subTrainsetLabels[devId], + CUANN_SIMILARITY_L2, + (iter != 0), + pqPredictWorkspace[devId], + pqCentersTemp[devId], + pqClusterSize[devId], + true); + if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(pqCentersEach[devId], + (1 << desc->bitPq), + desc->lenPq, + subTrainset[devId], + CUDA_R_32F, + numTrainset, + subTrainsetLabels[devId], + CUANN_SIMILARITY_L2, + 
pqClusterSize[devId], + (float)1.0 / 4, + wsKAC[devId])) { + iter -= 1; + } + } + cudaMemcpy(curPqCenters, + pqCentersEach[devId], + sizeof(float) * ((1 << desc->bitPq) * desc->lenPq), + cudaMemcpyDeviceToDevice); +#ifdef CUANN_DEBUG + if (j == 0) { + cudaDeviceSynchronize(); + _cuann_kmeans_show_centers( + curPqCenters, (1 << desc->bitPq), desc->lenPq, pqClusterSize[devId]); + } +#endif + } + fprintf(stderr, "\n"); + cudaSetDevice(cuannDevId); + + _cuann_multi_device_free(subTrainset, handle->numDevices); + _cuann_multi_device_free(subTrainsetLabels, handle->numDevices); + _cuann_multi_device_free(pqCentersEach, handle->numDevices); + free(modTrainset); + } + + // + // Compute PQ code for whole dataset + // + _cuann_compute_PQ_code(handle, + desc->numDataset, + desc->dimDataset, + desc->dimRotDataset, + desc->dimPq, + desc->lenPq, + desc->bitPq, + desc->numClusters, + dtype, + desc->typePqCenter, + maxClusterSize, + clusterCenters, + rotationMatrix, + dataset, + originalNumbers, + clusterSize, + indexPtr, + pqCenters, + numIterations, + pqDataset); + cudaSetDevice(cuannDevId); + + // + _cuann_get_inclusiveSumSortedClusterSize( + desc, indexPtr, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); + _cuann_get_sqsumClusters(desc, clusterCenters, &(desc->sqsumClusters)); + + { + // combine clusterCenters and sqsumClusters + cudaDeviceSynchronize(); + float* tmpClusterCenters; // [numClusters, dimDataset] + cudaError = + cudaMallocManaged(&tmpClusterCenters, sizeof(float) * desc->numClusters * desc->dimDataset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + for (uint32_t i = 0; i < desc->numClusters * desc->dimDataset; i++) { + tmpClusterCenters[i] = clusterCenters[i]; + } + for (uint32_t i = 0; i < desc->numClusters; i++) { + for (uint32_t j = 0; j < desc->dimDataset; j++) { + clusterCenters[j + (desc->dimDatasetExt * i)] = + tmpClusterCenters[j + (desc->dimDataset * i)]; + } + clusterCenters[desc->dimDataset + (desc->dimDatasetExt * i)] = desc->sqsumClusters[i]; + } + cudaFree(tmpClusterCenters); + } + + // + cuannIvfPqGetIndexSize(desc, &(header->indexSize)); + header->similarity = desc->similarity; + header->numClusters = desc->numClusters; + header->numDataset = desc->numDataset; + header->dimDataset = desc->dimDataset; + header->dimPq = desc->dimPq; + header->maxClusterSize = maxClusterSize; + header->dimRotDataset = desc->dimRotDataset; + header->bitPq = desc->bitPq; + header->typePqCenter = desc->typePqCenter; + header->dtypeDataset = desc->dtypeDataset; + header->dimDatasetExt = desc->dimDatasetExt; + header->numDatasetAdded = 0; + + // + cudaFree(clusterSize); + cudaFree(trainsetLabels); + cudaFree(datasetLabels); + cudaFree(clusterCentersTemp); + + _cuann_multi_device_free(wsKAC, handle->numDevices); + _cuann_multi_device_free(pqCentersTemp, handle->numDevices); + _cuann_multi_device_free(pqClusterSize, handle->numDevices); + _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, handle->numDevices); + + cuannSetDevice(handle, cuannDevId); + _cuann_set_device(callerDevId); + + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqSaveIndex +cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const char* fileName) +{ + if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int orgDevId = _cuann_set_device(handle->devId); + + FILE* fp = fopen(fileName, "w"); + if (fp == NULL) { + 
fprintf(stderr, "(%s) failed to open file (%s).\n", __func__, fileName); + return CUANN_STATUS_FILEIO_ERROR; + } + struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)index; + fprintf(stderr, "(%s) indexSize: %lu\n", __func__, header->indexSize); + if (fwrite(index, 1, header->indexSize, fp) != header->indexSize) { + fprintf(stderr, "(%s) failed to save index to file (%s)\n", __func__, fileName); + return CUANN_STATUS_FILEIO_ERROR; + } + fclose(fp); + + _cuann_set_device(orgDevId); + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqLoadIndex +cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + void** index, + const char* fileName) +{ + if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int orgDevId = _cuann_set_device(handle->devId); + + if (1 /* *index == NULL */) { + FILE* fp = fopen(fileName, "r"); + if (fp == NULL) { + fprintf(stderr, "(%s) failed to open file (%s)\n", __func__, fileName); + return CUANN_STATUS_FILEIO_ERROR; + } + size_t indexSize; + fread(&indexSize, sizeof(size_t), 1, fp); + fprintf(stderr, "(%s) indexSize: %lu\n", __func__, indexSize); + cudaError_t cudaError = cudaMallocManaged(index, indexSize); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s) cudaMallocManaged() failed.\n", __func__); + return CUANN_STATUS_ALLOC_FAILED; + } + fseek(fp, 0, SEEK_SET); + if (fread(*index, 1, indexSize, fp) != indexSize) { + fprintf(stderr, "(%s) failed to load index to from file (%s)\n", __func__, fileName); + return CUANN_STATUS_FILEIO_ERROR; + } + fclose(fp); + + cudaMemAdvise(index, indexSize, cudaMemAdviseSetReadMostly, handle->devId); + } + + struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(*index); + desc->numClusters = header->numClusters; + desc->numDataset = header->numDataset; + desc->dimDataset = header->dimDataset; + desc->dimPq = header->dimPq; + desc->similarity = (cuannSimilarity_t)header->similarity; + desc->maxClusterSize = header->maxClusterSize; + desc->dimRotDataset = header->dimRotDataset; + desc->lenPq = desc->dimRotDataset / desc->dimPq; + desc->bitPq = header->bitPq; + desc->typePqCenter = (cuannPqCenter_t)header->typePqCenter; + desc->dtypeDataset = (cudaDataType_t)header->dtypeDataset; + desc->dimDatasetExt = header->dimDatasetExt; + desc->indexVersion = header->version; + + float* clusterCenters; // [numClusters, dimDatasetExt] + float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* originalNumbers; // [numDataset] + uint32_t* indexPtr; // [numClusters + 1] + float* rotationMatrix; // [dimDataset, dimRotDataset] + float* clusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(desc, + *index, + &header, + &clusterCenters, + &pqCenters, + &pqDataset, + &originalNumbers, + &indexPtr, + &rotationMatrix, + &clusterRotCenters); + + // + _cuann_get_inclusiveSumSortedClusterSize( + desc, indexPtr, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); + + size_t size; + // pqDataset + size = sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8; + if (size < (handle->deviceProp).totalGlobalMem) { + cudaMemPrefetchAsync(pqDataset, size, handle->devId); + } + // clusterCenters + size = sizeof(float) * desc->numClusters * desc->dimDatasetExt; + cudaMemPrefetchAsync(clusterCenters, size, handle->devId); + // pqCenters + if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + size = sizeof(float) * desc->dimPq * 
(1 << desc->bitPq) * desc->lenPq; + } else { + size = sizeof(float) * desc->numClusters * (1 << desc->bitPq) * desc->lenPq; + } + cudaMemPrefetchAsync(pqCenters, size, handle->devId); + // originalNumbers + size = sizeof(uint32_t) * desc->numDataset; + cudaMemPrefetchAsync(originalNumbers, size, handle->devId); + // indexPtr + size = sizeof(uint32_t) * (desc->numClusters + 1); + cudaMemPrefetchAsync(indexPtr, size, handle->devId); + // rotationMatrix + if (rotationMatrix != NULL) { + size = sizeof(float) * desc->dimDataset * desc->dimRotDataset; + cudaMemPrefetchAsync(rotationMatrix, size, handle->devId); + } + // clusterRotCenters + if (clusterRotCenters != NULL) { + size = sizeof(float) * desc->numClusters * desc->dimRotDataset; + cudaMemPrefetchAsync(clusterRotCenters, size, handle->devId); + } + + _cuann_set_device(orgDevId); + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex +cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( + cuannHandle_t handle, + const char* oldIndexFileName, + const char* newIndexFileName, + const void* newVectors, /* [numNewVectors, dimDataset] */ + uint32_t numNewVectors) +{ + cudaError_t cudaError; + cuannStatus_t ret; + if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, newVectors); + if (attr.type == cudaMemoryTypeDevice) { + fprintf(stderr, "(%s, %d) newVectors must be accessible from the host.\n", __func__, __LINE__); + return CUANN_STATUS_INVALID_POINTER; + } + int cuannDevId = handle->devId; + int callerDevId = _cuann_set_device(cuannDevId); + + // + // Load old index + // + cuannIvfPqDescriptor_t oldDesc; + ret = cuannIvfPqCreateDescriptor(&oldDesc); + if (ret != CUANN_STATUS_SUCCESS) { return ret; } + void* oldIndex; + ret = cuannIvfPqLoadIndex(handle, oldDesc, &oldIndex, oldIndexFileName); + if (ret != CUANN_STATUS_SUCCESS) { return ret; } + cudaDataType_t dtype = oldDesc->dtypeDataset; + char dtypeString[64]; + fprintf(stderr, "(%s) dtype: %s\n", __func__, _cuann_get_dtype_string(dtype, dtypeString)); + fprintf(stderr, "(%s) dimDataset: %u\n", __func__, oldDesc->dimDataset); + struct cuannIvfPqIndexHeader* oldHeader; + float* oldClusterCenters; // [numClusters, dimDatasetExt] + float* oldPqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* oldPqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* oldOriginalNumbers; // [numDataset] + uint32_t* oldIndexPtr; // [numClusters + 1] + float* oldRotationMatrix; // [dimDataset, dimRotDataset] + float* oldClusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(oldDesc, + oldIndex, + &oldHeader, + &oldClusterCenters, + &oldPqCenters, + &oldPqDataset, + &oldOriginalNumbers, + &oldIndexPtr, + &oldRotationMatrix, + &oldClusterRotCenters); + + // + // The clusterCenters stored in index contain data other than cluster + // centroids to speed up the search. Here, only the cluster centroids + // are extracted. 
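+  // (Row layout, as written by cuannIvfPqBuildIndex: each stored record is
+  //  [ centroid[0 .. dimDataset-1] | squared L2 norm | padding ] with row
+  //  stride dimDatasetExt, so only the first dimDataset floats of each row
+  //  are copied out below.)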
+ // + float* clusterCenters; // [numClusters, dimDataset] + cudaError = + cudaMallocManaged(&clusterCenters, sizeof(float) * oldDesc->numClusters * oldDesc->dimDataset); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + for (int i = 0; i < oldDesc->numClusters; i++) { + memcpy(clusterCenters + (uint64_t)i * oldDesc->dimDataset, + oldClusterCenters + (uint64_t)i * oldDesc->dimDatasetExt, + sizeof(float) * oldDesc->dimDataset); + } + + // + // Use the existing cluster centroids to find the label (cluster ID) + // of the vector to be added. + // + uint32_t* newVectorLabels; // [numNewVectors,] + cudaError = cudaMallocManaged(&newVectorLabels, sizeof(uint32_t) * numNewVectors); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + cudaMemset(newVectorLabels, 0, sizeof(uint32_t) * numNewVectors); + uint32_t* clusterSize; // [numClusters,] + cudaError = cudaMallocManaged(&clusterSize, sizeof(uint32_t) * oldDesc->numClusters); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters); + fprintf(stderr, "(%s) Predict label of new vectors\n", __func__); + _cuann_kmeans_predict_MP(handle->numDevices, + handle->cublasHandles, + clusterCenters, + oldDesc->numClusters, + oldDesc->dimDataset, + newVectors, + dtype, + numNewVectors, + newVectorLabels, + oldDesc->similarity, + true, + clusterSize, + false /* do not update clusterCenters */); + +#ifdef CUANN_DEBUG + if (1) { + const int _num_show = 10; + fprintf(stderr, "# numNewVectors: %u\n", numNewVectors); + fprintf(stderr, "# newVectorLabels: "); + for (int i = 0; i < numNewVectors; i++) { + if ((i < _num_show) || (numNewVectors - i <= _num_show)) { + fprintf(stderr, "%u, ", newVectorLabels[i]); + } else if (i == _num_show) { + fprintf(stderr, "..., "); + } + } + fprintf(stderr, "\n"); + } + if (1) { + const int _num_show = 10; + fprintf(stderr, "# oldDesc->numClusters: %u\n", oldDesc->numClusters); + fprintf(stderr, "# clusterSize: "); + int _sum = 0; + for (int i = 0; i < oldDesc->numClusters; i++) { + _sum += clusterSize[i]; + if ((i < _num_show) || (oldDesc->numClusters - i <= _num_show)) { + fprintf(stderr, "%u, ", clusterSize[i]); + } else if (i == _num_show) { + fprintf(stderr, "..., "); + } + } + fprintf(stderr, "\n"); + fprintf(stderr, "# _sum: %d\n", _sum); + } +#endif + + // + // Make indexPtr, originalNumbers + // + uint32_t maxClusterSize = 0; + uint32_t* indexPtr; // [numClusters + 1] + uint32_t* originalNumbers; // [numNewVectors] + indexPtr = (uint32_t*)malloc(sizeof(uint32_t) * (oldDesc->numClusters + 1)); + originalNumbers = (uint32_t*)malloc(sizeof(uint32_t) * numNewVectors); + // indexPtr + indexPtr[0] = 0; + for (uint32_t l = 0; l < oldDesc->numClusters; l++) { + indexPtr[l + 1] = indexPtr[l] + clusterSize[l]; + maxClusterSize = max(maxClusterSize, clusterSize[l]); + } + if (indexPtr[oldDesc->numClusters] != numNewVectors) { + fprintf(stderr, "(%s, %d) Unexpected Error.\n", __func__, __LINE__); + return CUANN_STATUS_INTERNAL_ERROR; + } + // originalNumbers + for (uint32_t i = 0; i < numNewVectors; i++) { + uint32_t l = newVectorLabels[i]; + originalNumbers[indexPtr[l]] = i; + indexPtr[l] += 1; + } + // Recover indexPtr + for (uint32_t l = 0; l < 
oldDesc->numClusters; l++) { + indexPtr[l] -= clusterSize[l]; + } + + // + // Compute PQ code for new vectors + // + uint8_t* pqDataset; // [numNewVectors, dimPq * bitPq / 8] + cudaError = cudaMallocManaged( + &pqDataset, sizeof(uint8_t) * numNewVectors * oldDesc->dimPq * oldDesc->bitPq / 8); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); + return CUANN_STATUS_ALLOC_FAILED; + } + _cuann_compute_PQ_code(handle, + numNewVectors, + oldDesc->dimDataset, + oldDesc->dimRotDataset, + oldDesc->dimPq, + oldDesc->lenPq, + oldDesc->bitPq, + oldDesc->numClusters, + dtype, + oldDesc->typePqCenter, + maxClusterSize, + clusterCenters, + oldRotationMatrix, + newVectors, + originalNumbers, + clusterSize, + indexPtr, + oldPqCenters, + 0, + pqDataset); + cudaSetDevice(cuannDevId); + + // + // Create descriptor for new index + // + cuannIvfPqDescriptor_t newDesc; + ret = cuannIvfPqCreateDescriptor(&newDesc); + if (ret != CUANN_STATUS_SUCCESS) { return ret; } + memcpy(newDesc, oldDesc, sizeof(struct cuannIvfPqDescriptor)); + newDesc->numDataset += numNewVectors; + fprintf( + stderr, "(%s) numDataset: %u -> %u\n", __func__, oldDesc->numDataset, newDesc->numDataset); + + // + // Allocate memory for new index + // + size_t newIndexSize; + ret = cuannIvfPqGetIndexSize(newDesc, &newIndexSize); + if (ret != CUANN_STATUS_SUCCESS) { return ret; } + fprintf(stderr, "(%s) indexSize: %lu -> %lu\n", __func__, oldHeader->indexSize, newIndexSize); + void* newIndex = malloc(newIndexSize); + memset(newIndex, 0, newIndexSize); + struct cuannIvfPqIndexHeader* newHeader; + float* newClusterCenters; // [numClusters, dimDatasetExt] + float* newPqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* newPqDataset; // [numDataset, dimPq * bitPq / 8] *** + uint32_t* newOriginalNumbers; // [numDataset] *** + uint32_t* newIndexPtr; // [numClusters + 1] *** + float* newRotationMatrix; // [dimDataset, dimRotDataset] + float* newClusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(newDesc, + newIndex, + &newHeader, + &newClusterCenters, + &newPqCenters, + &newPqDataset, + &newOriginalNumbers, + &newIndexPtr, + &newRotationMatrix, + &newClusterRotCenters); + + // + // Copy the unchanged parts + // header, clusterCenters, pqCenters, rotationMatrix, clusterRotCenters + // + memcpy(newHeader, oldHeader, sizeof(struct cuannIvfPqIndexHeader)); + { + cuannIvfPqGetIndexSize(newDesc, &(newHeader->indexSize)); + newHeader->numDataset = newDesc->numDataset; + newHeader->numDatasetAdded += numNewVectors; + } + memcpy(newClusterCenters, oldClusterCenters, _cuann_getIndexSize_clusterCenters(oldDesc)); + memcpy(newPqCenters, oldPqCenters, _cuann_getIndexSize_pqCenters(oldDesc)); + memcpy(newRotationMatrix, oldRotationMatrix, _cuann_getIndexSize_rotationMatrix(oldDesc)); + memcpy( + newClusterRotCenters, oldClusterRotCenters, _cuann_getIndexSize_clusterRotCenters(oldDesc)); + + // + // Make newIndexPtr + // + maxClusterSize = 0; + newIndexPtr[0] = 0; + for (uint32_t l = 0; l < newDesc->numClusters; l++) { + uint32_t oldClusterSize = oldIndexPtr[l + 1] - oldIndexPtr[l]; + newIndexPtr[l + 1] = newIndexPtr[l]; + newIndexPtr[l + 1] += oldClusterSize + clusterSize[l]; + maxClusterSize = max(maxClusterSize, oldClusterSize + clusterSize[l]); + } + { + newDesc->maxClusterSize = maxClusterSize; + newHeader->maxClusterSize = maxClusterSize; + } + fprintf(stderr, + "(%s) maxClusterSize: %u -> %u\n", + __func__, + 
oldDesc->maxClusterSize, + newDesc->maxClusterSize); + + // + // Make newOriginalNumbers + // + for (uint32_t i = 0; i < numNewVectors; i++) { + originalNumbers[i] += oldDesc->numDataset; + } + for (uint32_t l = 0; l < newDesc->numClusters; l++) { + uint32_t oldClusterSize = oldIndexPtr[l + 1] - oldIndexPtr[l]; + memcpy(newOriginalNumbers + newIndexPtr[l], + oldOriginalNumbers + oldIndexPtr[l], + sizeof(uint32_t) * oldClusterSize); + memcpy(newOriginalNumbers + newIndexPtr[l] + oldClusterSize, + originalNumbers + indexPtr[l], + sizeof(uint32_t) * clusterSize[l]); + } + + // + // Make newPqDataset + // + size_t unitPqDataset = newDesc->dimPq * newDesc->bitPq / 8; + for (uint32_t l = 0; l < newDesc->numClusters; l++) { + uint32_t oldClusterSize = oldIndexPtr[l + 1] - oldIndexPtr[l]; + memcpy(newPqDataset + unitPqDataset * newIndexPtr[l], + oldPqDataset + unitPqDataset * oldIndexPtr[l], + sizeof(uint8_t) * unitPqDataset * oldClusterSize); + memcpy(newPqDataset + unitPqDataset * (newIndexPtr[l] + oldClusterSize), + pqDataset + unitPqDataset * indexPtr[l], + sizeof(uint8_t) * unitPqDataset * clusterSize[l]); + } + + // + // Save new index + // + ret = cuannIvfPqSaveIndex(handle, newDesc, newIndex, newIndexFileName); + if (ret != CUANN_STATUS_SUCCESS) { return ret; } + if (newHeader->numDatasetAdded * 2 >= newHeader->numDataset) { + fprintf(stderr, + "(%s) The total number of vectors in the new index" + " is now more than twice the initial number of vectors." + " You may want to re-build the index from scratch." + " (numVectors: %u, numVectorsAdded: %u)\n", + __func__, + newHeader->numDataset, + newHeader->numDatasetAdded); + } + + // + // Finalize + // + cuannIvfPqDestroyDescriptor(oldDesc); + cuannIvfPqDestroyDescriptor(newDesc); + + free(originalNumbers); + free(indexPtr); + free(newIndex); + + cudaFree(pqDataset); + cudaFree(clusterSize); + cudaFree(newVectorLabels); + cudaFree(clusterCenters); + cudaFree(oldIndex); + + cuannSetDevice(handle, cuannDevId); + _cuann_set_device(callerDevId); + + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqSetSearchParameters +cuannStatus_t cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, + const uint32_t numProbes, + const uint32_t topK) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + if (numProbes == 0) { + fprintf( + stderr, "(%s) numProbes must be larger than zero (numProbes:%u).\n", __func__, numProbes); + return CUANN_STATUS_INVALID_VALUE; + } + if (topK == 0) { + fprintf(stderr, "(%s) topK must be larger than zero (topK:%u).\n", __func__, topK); + return CUANN_STATUS_INVALID_VALUE; + } + if (numProbes > desc->numClusters) { + fprintf(stderr, + "(%s) numProbes must be smaller than or equal to numClusters (numProbes:%u, " + "numClusters:%u).\n", + __func__, + numProbes, + desc->numClusters); + return CUANN_STATUS_INVALID_VALUE; + } + if (topK > desc->numDataset) { + fprintf(stderr, + "(%s) topK must be smaller than or equal to numDataset (topK:%u, numDataset:%u).\n", + __func__, + topK, + desc->numDataset); + return CUANN_STATUS_INVALID_VALUE; + } + uint32_t numSamplesWorstCase = desc->numDataset; + if (numProbes < desc->numClusters) { + numSamplesWorstCase = + desc->numDataset - + desc->inclusiveSumSortedClusterSize[desc->numClusters - 1 - numProbes - + desc->_numClustersSize0]; // (*) urgent WA, need to be + // fixed. 
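+    // (numSamplesWorstCase: assuming inclusiveSumSortedClusterSize holds the
+    //  inclusive prefix sums of cluster sizes sorted in descending order, this
+    //  is the number of samples still reachable when the numProbes probed
+    //  clusters turn out to be the smallest ones; topK must fit within it.)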
+ } + if (topK > numSamplesWorstCase) { + fprintf(stderr, + "(%s) numProbes is too small to get topK results reliably (numProbes:%u, topK:%u, " + "numSamplesWorstCase:%u).\n", + __func__, + numProbes, + topK, + numSamplesWorstCase); + return CUANN_STATUS_INVALID_VALUE; + } + desc->numProbes = numProbes; + desc->topK = topK; + if (0) { + char dtypeString[64]; + fprintf( + stderr, "# dtypeDataset: %s\n", _cuann_get_dtype_string(desc->dtypeDataset, dtypeString)); + } + desc->maxSamples = desc->inclusiveSumSortedClusterSize[numProbes - 1]; + if (desc->maxSamples % 128) { desc->maxSamples += 128 - (desc->maxSamples % 128); } + desc->internalDistanceDtype = CUDA_R_32F; + desc->smemLutDtype = CUDA_R_32F; + desc->preferredThreadBlockSize = 0; + // fprintf(stderr, "# maxSample: %u\n", desc->inclusiveSumSortedClusterSize[0]); + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqSetSearchParameters +cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t internalDistanceDtype, + cudaDataType_t smemLutDtype, + const uint32_t preferredThreadBlockSize) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + if (internalDistanceDtype != CUDA_R_16F && internalDistanceDtype != CUDA_R_32F) { + fprintf( + stderr, "(%s) internalDistanceDtype must be either CUDA_R_16F or CUDA_R_32F\n", __func__); + return CUANN_STATUS_UNSUPPORTED_DTYPE; + } + if (smemLutDtype != CUDA_R_16F && smemLutDtype != CUDA_R_32F && smemLutDtype != CUDA_R_8U) { + fprintf(stderr, "(%s) smemLutDtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U\n", __func__); + return CUANN_STATUS_UNSUPPORTED_DTYPE; + } + if (preferredThreadBlockSize != 256 && preferredThreadBlockSize != 512 && + preferredThreadBlockSize != 1024 && preferredThreadBlockSize != 0) { + fprintf(stderr, + "(%s) preferredThreadBlockSize must be 0, 256, 512 or 1024. 
%u is given.\n", + __func__, + preferredThreadBlockSize); + return CUANN_STATUS_UNSUPPORTED_DTYPE; + } + desc->internalDistanceDtype = internalDistanceDtype; + desc->smemLutDtype = smemLutDtype; + if (0) { + char dtypeString[64]; + fprintf(stderr, + "# internalDistanceDtype: %s\n", + _cuann_get_dtype_string(desc->internalDistanceDtype, dtypeString)); + } + desc->preferredThreadBlockSize = preferredThreadBlockSize; + // fprintf(stderr, "# maxSample: %u\n", desc->inclusiveSumSortedClusterSize[0]); + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqGetSearchParameters +cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numProbes, + uint32_t* topK) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + *numProbes = desc->numProbes; + *topK = desc->topK; + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqGetSearchTuningParameters +cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t* internalDistanceDtype, + cudaDataType_t* smemLutDtype, + uint32_t* preferredThreadBlockSize) +{ + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + *internalDistanceDtype = desc->internalDistanceDtype; + *smemLutDtype = desc->smemLutDtype; + *preferredThreadBlockSize = desc->preferredThreadBlockSize; + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqSearch +cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + uint32_t maxQueries, + size_t maxWorkspaceSize, + size_t* workspaceSize) +{ + if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + + size_t max_ws = maxWorkspaceSize; + if (max_ws == 0) { + max_ws = (size_t)1 * 1024 * 1024 * 1024; // default, 1GB + } else { + max_ws = max(max_ws, (size_t)512 * 1024 * 1024); + } + + size_t size_0 = + _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries + _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries + _cuann_aligned(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries + _cuann_aligned(sizeof(uint32_t) * maxQueries * desc->numProbes) + // clusterLabels.. + _cuann_aligned(sizeof(float) * maxQueries * desc->numClusters) + // QCDistances + _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); + if (size_0 > max_ws) { + maxQueries = maxQueries * max_ws / size_0; + if (maxQueries > 32) { maxQueries -= (maxQueries % 32); } + // fprintf(stderr, "(%s) maxQueries is reduced to %u.\n", __func__, maxQueries); + } + // maxQueries = min(max(maxQueries, 1), 1024); + // maxQueries = min(max(maxQueries, 1), 2048); + maxQueries = min(max(maxQueries, 1), 4096); + desc->maxQueries = maxQueries; + + *workspaceSize = + _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries + _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries + _cuann_aligned(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries + _cuann_aligned(sizeof(uint32_t) * maxQueries * desc->numProbes); // clusterLabels.. + + max_ws -= *workspaceSize; + desc->maxBatchSize = 1; + while (1) { + uint32_t nextBatchSize = desc->maxBatchSize * max_ws / ivfpq_search_bufferSize(handle, desc); + if (desc->maxBatchSize >= nextBatchSize) break; + desc->maxBatchSize = nextBatchSize; + } + desc->maxBatchSize = min(max(desc->maxBatchSize, 1), maxQueries); + + if (maxQueries > desc->maxBatchSize) { + // Adjust maxBatchSize to reduce workspace size. 
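+    // e.g. maxQueries = 1000, maxBatchSize = 300 -> num = 4 batches, so
+    // maxBatchSize is lowered to ceil(1000 / 4) = 250 and the four batches are
+    // equally sized (250 each) instead of 300 + 300 + 300 + 100, shrinking the
+    // per-batch workspace without adding a batch.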
+ uint32_t num = (maxQueries + desc->maxBatchSize - 1) / desc->maxBatchSize; + if (1 < num && num < 5) { desc->maxBatchSize = (maxQueries + num - 1) / num; } + } + + if (1) { + // Adjust maxBatchSize to improve GPU occupancy of topk kernel. + uint32_t numCta_total = (handle->deviceProp).multiProcessorCount * 2; + uint32_t numCta_perBatch = numCta_total / desc->maxBatchSize; + float utilization = (float)numCta_perBatch * desc->maxBatchSize / numCta_total; + if (numCta_perBatch > 1 || (numCta_perBatch == 1 && utilization < 0.6)) { + uint32_t numCta_perBatch_1 = numCta_perBatch + 1; + uint32_t maxBatchSize_1 = numCta_total / numCta_perBatch_1; + float utilization_1 = (float)numCta_perBatch_1 * maxBatchSize_1 / numCta_total; + // fprintf(stderr, "# maxBatchSize :%u, utilization :%f\n", desc->maxBatchSize, + // utilization); fprintf(stderr, "# maxBatchSize_1:%u, utilization_1:%f\n", maxBatchSize_1, + // utilization_1); + if (utilization < utilization_1) { desc->maxBatchSize = maxBatchSize_1; } + } + } + + size_t size_1 = + _cuann_aligned(sizeof(float) * maxQueries * desc->numClusters) + // QCDistance + _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); + size_t size_2 = ivfpq_search_bufferSize(handle, desc); + *workspaceSize += max(size_1, size_2); + +#ifdef CUANN_DEBUG + fprintf(stderr, "# maxQueries: %u\n", maxQueries); + fprintf(stderr, "# maxBatchSize: %u\n", desc->maxBatchSize); + fprintf(stderr, + "# workspaceSize: %lu (%.3f GiB)\n", + *workspaceSize, + (float)*workspaceSize / 1024 / 1024 / 1024); +#endif + + return CUANN_STATUS_SUCCESS; +} + +// cuannIvfPqSearch +cuannStatus_t cuannIvfPqSearch( + cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const void* queries, // [numQueries, dimDataset], host or device pointer + cudaDataType_t dtype, + uint32_t numQueries, + uint64_t* neighbors, // [numQueries, topK], device pointer + float* distances, // [numQueries, topK], device pointer + void* workspace) +{ + if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int orgDevId = _cuann_set_device(handle->devId); + + if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { + return CUANN_STATUS_UNSUPPORTED_DTYPE; + } + + struct cuannIvfPqIndexHeader* header; + float* clusterCenters; // [numClusters, dimDatasetExt] + float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* originalNumbers; // [numDataset] + uint32_t* indexPtr; // [numClusters + 1] + float* rotationMatrix; // [dimDataset, dimRotDataset] + float* clusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(desc, + index, + &header, + &clusterCenters, + &pqCenters, + &pqDataset, + &originalNumbers, + &indexPtr, + &rotationMatrix, + &clusterRotCenters); + // + void* devQueries; // [maxQueries, dimDatasetExt] + float* curQueries; // [maxQueries, dimDatasetExt] + float* rotQueries; // [maxQueries, dimRotDataset] + uint32_t* clusterLabelsToProbe; // [maxQueries, numProbes] + float* QCDistances; // [maxQueries, numClusters] + void* topkWorkspace; + void* searchWorkspace; + devQueries = (void*)workspace; + curQueries = (float*)((uint8_t*)devQueries + + _cuann_aligned(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); + rotQueries = (float*)((uint8_t*)curQueries + + _cuann_aligned(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); + clusterLabelsToProbe = + (uint32_t*)((uint8_t*)rotQueries + + 
_cuann_aligned(sizeof(float) * desc->maxQueries * desc->dimRotDataset)); + // + QCDistances = (float*)((uint8_t*)clusterLabelsToProbe + + _cuann_aligned(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); + topkWorkspace = (void*)((uint8_t*)QCDistances + + _cuann_aligned(sizeof(float) * desc->maxQueries * desc->numClusters)); + // + searchWorkspace = (void*)((uint8_t*)clusterLabelsToProbe + + _cuann_aligned(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); + + void (*_ivfpq_search)(cuannHandle_t, + cuannIvfPqDescriptor_t, + uint32_t, + const float*, + const float*, + const uint8_t*, + const uint32_t*, + const uint32_t*, + const uint32_t*, + const float*, + uint64_t*, + float*, + void*); + if (desc->internalDistanceDtype == CUDA_R_16F) { + if (desc->smemLutDtype == CUDA_R_16F) { + _ivfpq_search = ivfpq_search; + } else if (desc->smemLutDtype == CUDA_R_8U) { + _ivfpq_search = ivfpq_search>; + } else { + _ivfpq_search = ivfpq_search; + } + } else { + if (desc->smemLutDtype == CUDA_R_16F) { + _ivfpq_search = ivfpq_search; + } else if (desc->smemLutDtype == CUDA_R_8U) { + _ivfpq_search = ivfpq_search>; + } else { + _ivfpq_search = ivfpq_search; + } + } + + cublasStatus_t cublasError; + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, neighbors); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + fprintf(stderr, "(%s) neighbors must be accessible from the device.\n", __func__); + return CUANN_STATUS_INVALID_POINTER; + } + cudaPointerGetAttributes(&attr, distances); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + fprintf(stderr, "(%s) distances must be accessible from the device.\n", __func__); + return CUANN_STATUS_INVALID_POINTER; + } + cudaPointerGetAttributes(&attr, queries); + +#ifdef CUANN_DEBUG + cudaError_t cudaError; +#endif + + for (uint32_t i = 0; i < numQueries; i += desc->maxQueries) { + uint32_t nQueries = min(desc->maxQueries, numQueries - i); + + float fillValue = 0.0; + if (desc->similarity == CUANN_SIMILARITY_L2) { fillValue = 1.0 / -2.0; } + float divisor = 1.0; + if (desc->dtypeDataset == CUDA_R_8U) { + divisor = 256.0; + } else if (desc->dtypeDataset == CUDA_R_8I) { + divisor = 128.0; + } + if (dtype == CUDA_R_32F) { + float* ptrQueries = (float*)queries + ((uint64_t)(desc->dimDataset) * i); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(float) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle->stream); + ptrQueries = (float*)devQueries; + } + _cuann_copy_fill(nQueries, + desc->dimDataset, + ptrQueries, + desc->dimDataset, + curQueries, + desc->dimDatasetExt, + fillValue, + divisor, + handle->stream); + } else if (dtype == CUDA_R_8U) { + uint8_t* ptrQueries = (uint8_t*)queries + ((uint64_t)(desc->dimDataset) * i); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(uint8_t) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle->stream); + ptrQueries = (uint8_t*)devQueries; + } + _cuann_copy_fill(nQueries, + desc->dimDataset, + ptrQueries, + desc->dimDataset, + curQueries, + desc->dimDatasetExt, + fillValue, + divisor, + handle->stream); + } else if (dtype == CUDA_R_8I) { + int8_t* ptrQueries = (int8_t*)queries + ((uint64_t)(desc->dimDataset) * i); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(int8_t) * 
nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle->stream); + ptrQueries = (int8_t*)devQueries; + } + _cuann_copy_fill(nQueries, + desc->dimDataset, + ptrQueries, + desc->dimDataset, + curQueries, + desc->dimDatasetExt, + fillValue, + divisor, + handle->stream); + } + + float alpha; + float beta; + uint32_t gemmK = desc->dimDataset; + if (desc->similarity == CUANN_SIMILARITY_INNER) { + alpha = -1.0; + beta = 0.0; + } else { + alpha = -2.0; + beta = 0.0; + gemmK = desc->dimDataset + 1; + assert(gemmK <= desc->dimDatasetExt); + } + cublasError = cublasGemmEx(handle->cublasHandle, + CUBLAS_OP_T, + CUBLAS_OP_N, + desc->numClusters, + nQueries, + gemmK, + &alpha, + clusterCenters, + CUDA_R_32F, + desc->dimDatasetExt, + curQueries, + CUDA_R_32F, + desc->dimDatasetExt, + &beta, + QCDistances, + CUDA_R_32F, + desc->numClusters, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUBLAS_ERROR; + } + + // Rotate queries + alpha = 1.0; + beta = 0.0; + cublasError = cublasGemmEx(handle->cublasHandle, + CUBLAS_OP_T, + CUBLAS_OP_N, + desc->dimRotDataset, + nQueries, + desc->dimDataset, + &alpha, + rotationMatrix, + CUDA_R_32F, + desc->dimDataset, + curQueries, + CUDA_R_32F, + desc->dimDatasetExt, + &beta, + rotQueries, + CUDA_R_32F, + desc->dimRotDataset, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + if (cublasError != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUBLAS_ERROR; + } + + // Select neighbor clusters for each query. + _cuann_find_topk(handle, + desc->numProbes, + nQueries, + desc->numClusters, + NULL, + QCDistances, + clusterLabelsToProbe, + topkWorkspace, + false); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUDA_ERROR; + } +#endif + // + for (uint32_t j = 0; j < nQueries; j += desc->maxBatchSize) { + uint32_t batchSize = min(desc->maxBatchSize, nQueries - j); + _ivfpq_search(handle, + desc, + batchSize, + clusterRotCenters, + pqCenters, + pqDataset, + originalNumbers, + indexPtr, + clusterLabelsToProbe + ((uint64_t)(desc->numProbes) * j), + rotQueries + ((uint64_t)(desc->dimRotDataset) * j), + neighbors + ((uint64_t)(desc->topK) * (i + j)), + distances + ((uint64_t)(desc->topK) * (i + j)), + searchWorkspace); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf( + stderr, "(%s, %d) cudaDeviceSynchronize() failed (%d)\n", __func__, __LINE__, cudaError); + fprintf(stderr, "# i:%u, nQueries:%u, j:%u, batchSize:%u\n", i, nQueries, j, batchSize); + return CUANN_STATUS_CUDA_ERROR; + } +#endif + } + } + +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + return CUANN_STATUS_CUDA_ERROR; + } +#endif + + _cuann_set_device(orgDevId); + return CUANN_STATUS_SUCCESS; +} + +} // namespace raft::spatial::knn::ivf_pq From ebc20beed10ec39f6670bfebf0b1a28c6bdbe6db Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 4 Aug 2022 15:41:10 +0200 Subject: [PATCH 002/140] Add missing functions and fix small compile errors --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 1303 +++++++++++++++++++++-- 1 file changed, 1225 insertions(+), 78 
deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index d7ead0fa6a..c00c99f813 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include "detail/ann_utils.cuh" + #include #include @@ -223,8 +225,9 @@ char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) } // -size_t _cuann_aligned(size_t size, size_t unit = 128) +size_t _cuann_aligned(size_t size) { + size_t unit = 128; if (size % unit) { size += unit - (size % unit); } return size; } @@ -245,41 +248,6 @@ void _cuann_memset(void* ptr, int value, size_t count) } } -// square sum along column -__global__ void kern_sqsum(uint32_t nRows, - uint32_t nCols, - const float* a, // [nRows, nCols] - float* out // [nRows] -) -{ - uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x); - if (iRow >= nRows) return; - - float sqsum = 0.0; - for (uint64_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { - float val = a[iCol + (nCols * iRow)]; - sqsum += val * val; - } - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); - if (threadIdx.x == 0) { out[iRow] = sqsum; } -} - -// square sum along column -void _cuann_sqsum(uint32_t nRows, - uint32_t nCols, - const float* a, // [numDataset, dimDataset] - float* out // [numDataset,] -) -{ - dim3 threads(32, 4, 1); // DO NOT CHANGE - dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1); - kern_sqsum<<>>(nRows, nCols, a, out); -} - // outer add __global__ void kern_outer_add(const float* a, uint32_t numA, @@ -297,19 +265,6 @@ __global__ void kern_outer_add(const float* a, c[gid] = valA + valB; } -// outer add -void _cuann_outer_add(const float* a, - uint32_t numA, - const float* b, - uint32_t numB, - float* c // [numA, numB] -) -{ - dim3 threads(128, 1, 1); - dim3 blocks(((uint64_t)numA * numB + threads.x - 1) / threads.x, 1, 1); - kern_outer_add<<>>(a, numA, b, numB, c); -} - // argmin along column __global__ void kern_argmin(uint32_t nRows, uint32_t nCols, @@ -652,8 +607,6 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput, if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } cudaPointerGetAttributes(&attr, input); if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } - // _cuann_memset(output, 0, sizeof(float) * nRowsOutput * nCols); - // _cuann_memset(count, 0, sizeof(uint32_t) * nRowsOutput); if (useGPU) { // GPU @@ -1001,9 +954,13 @@ void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle, alpha = -1.0; beta = 0.0; } else { - _cuann_sqsum(numCenters, dimCenters, centers, sqsumCenters); - _cuann_sqsum(numDataset, dimDataset, dataset, sqsumDataset); - _cuann_outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances); + detail::utils::dots_along_rows( + numCenters, dimCenters, centers, sqsumCenters, rmm::cuda_stream_default); + detail::utils::dots_along_rows( + numDataset, dimDataset, dataset, sqsumDataset, rmm::cuda_stream_default); + + detail::utils::outer_add( + sqsumDataset, numDataset, sqsumCenters, numCenters, distances, rmm::cuda_stream_default); alpha = -2.0; beta = 1.0; } @@ -1534,7 +1491,7 @@ bool _cuann_kmeans_adjust_centers(float* centers, // [numCenters, dimCenters] uint32_t i = 0; uint32_t count = 0; for (uint32_t l = 0; l < 
numCenters; l++) { - if (clusterSize[l] > (int)(average * threshold)) continue; + if (clusterSize[l] > (uint32_t)(average * threshold)) continue; do { i = (i + ofst) % numDataset; } while (clusterSize[labels[i]] < average); @@ -1681,7 +1638,7 @@ __device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, i // template -__launch_bounds__(1024, 2) __global__ +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ void kern_topk_cg_11(uint32_t topk, uint32_t size_batch, uint32_t max_len_x, @@ -1931,7 +1888,7 @@ __launch_bounds__(1024, 2) __global__ // template -__launch_bounds__(1024, 2) __global__ +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ void kern_topk_cta_11(uint32_t topk, uint32_t size_batch, uint32_t max_len_x, @@ -2239,7 +2196,7 @@ __device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, i // template -__launch_bounds__(1024, 2) __global__ +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ void kern_topk_cg_8(uint32_t topk, uint32_t size_batch, uint32_t max_len_x, @@ -2410,7 +2367,7 @@ __launch_bounds__(1024, 2) __global__ // template -__launch_bounds__(1024, 2) __global__ +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ void kern_topk_cta_8(uint32_t topk, uint32_t size_batch, uint32_t max_len_x, @@ -3597,7 +3554,7 @@ void _cuann_get_inclusiveSumSortedClusterSize( // [CPU] *output = (uint32_t*)malloc(sizeof(uint32_t) * desc->numClusters); desc->_numClustersSize0 = 0; - for (int i = 0; i < desc->numClusters; i++) { + for (uint32_t i = 0; i < desc->numClusters; i++) { (*output)[i] = indexPtr[i + 1] - indexPtr[i]; if ((*output)[i] > 0) continue; @@ -3625,7 +3582,7 @@ void _cuann_get_inclusiveSumSortedClusterSize( // sort qsort(*output, desc->numClusters, sizeof(uint32_t), descending); // scan - for (int i = 1; i < desc->numClusters; i++) { + for (uint32_t i = 1; i < desc->numClusters; i++) { (*output)[i] += (*output)[i - 1]; } assert((*output)[desc->numClusters - 1] == desc->numDataset); @@ -3643,7 +3600,8 @@ void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); exit(-1); } - _cuann_sqsum(desc->numClusters, desc->dimDataset, clusterCenters, *output); + detail::utils::dots_along_rows( + desc->numClusters, desc->dimDataset, clusterCenters, *output, rmm::cuda_stream_default); } // @@ -3691,42 +3649,42 @@ void _cuann_make_rotation_matrix(uint32_t nRows, double dot, norm; double* matrix = (double*)malloc(sizeof(double) * nRows * nCols); memset(matrix, 0, sizeof(double) * nRows * nCols); - for (int i = 0; i < nRows * nCols; i++) { + for (uint32_t i = 0; i < nRows * nCols; i++) { matrix[i] = _cuann_rand() - 0.5; } - for (int j = 0; j < nCols; j++) { + for (uint32_t j = 0; j < nCols; j++) { // normalize the j-th col vector norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix + j, nCols)); - for (int i = 0; i < nRows; i++) { + for (uint32_t i = 0; i < nRows; i++) { matrix[j + (nCols * i)] /= norm; } // orthogonalize the j-th col vector with the previous col vectors - for (int k = 0; k < j; k++) { + for (uint32_t k = 0; k < j; k++) { dot = _cuann_dot(nRows, matrix + j, nCols, matrix + k, nCols); - for (int i = 0; i < nRows; i++) { + for (uint32_t i = 0; i < nRows; i++) { matrix[j + (nCols * i)] -= dot * matrix[k + (nCols * i)]; } } // normalize the j-th col vector again norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix + j, nCols)); - for (int i = 0; i < nRows; i++) { + for (uint32_t i = 0; i < nRows; i++) { 
matrix[j + (nCols * i)] /= norm; } } - for (int i = 0; i < nRows * nCols; i++) { + for (uint32_t i = 0; i < nRows * nCols; i++) { rotationMatrix[i] = (float)matrix[i]; } free(matrix); } else { if (nRows == nCols) { memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); - for (int i = 0; i < nCols; i++) { + for (uint32_t i = 0; i < nCols; i++) { rotationMatrix[i + (nCols * i)] = 1.0; } } else { memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); - int i = 0; - for (int j = 0; j < nCols; j++) { + uint32_t i = 0; + for (uint32_t j = 0; j < nCols; j++) { rotationMatrix[j + (nCols * i)] = 1.0; i += lenPq; if (i >= nRows) { i = (i % nRows) + 1; } @@ -3740,7 +3698,7 @@ void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenter uint32_t numCenters, uint32_t dimCenters, const uint32_t* centerSize, - const int numShow = 5) + const uint32_t numShow = 5) { for (uint64_t k = 0; k < numCenters; k++) { if ((numShow <= k) && (k < numCenters - numShow)) { @@ -3763,7 +3721,7 @@ void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenter void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] uint32_t numDataset, uint32_t dimDataset, - const int numShow = 5) + const uint32_t numShow = 5) { for (uint64_t i = 0; i < numDataset; i++) { if ((numShow <= i) && (i < numDataset - numShow)) { @@ -3786,7 +3744,7 @@ void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq] uint32_t numDataset, uint32_t dimPq, - const int numShow = 5) + const uint32_t numShow = 5) { for (uint64_t i = 0; i < numDataset; i++) { if ((numShow <= i) && (i < numDataset - numShow)) { @@ -5351,7 +5309,7 @@ cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); return CUANN_STATUS_ALLOC_FAILED; } - for (int i = 0; i < oldDesc->numClusters; i++) { + for (uint32_t i = 0; i < oldDesc->numClusters; i++) { memcpy(clusterCenters + (uint64_t)i * oldDesc->dimDataset, oldClusterCenters + (uint64_t)i * oldDesc->dimDatasetExt, sizeof(float) * oldDesc->dimDataset); @@ -5395,7 +5353,7 @@ cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( const int _num_show = 10; fprintf(stderr, "# numNewVectors: %u\n", numNewVectors); fprintf(stderr, "# newVectorLabels: "); - for (int i = 0; i < numNewVectors; i++) { + for (uint32_t i = 0; i < numNewVectors; i++) { if ((i < _num_show) || (numNewVectors - i <= _num_show)) { fprintf(stderr, "%u, ", newVectorLabels[i]); } else if (i == _num_show) { @@ -5409,7 +5367,7 @@ cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( fprintf(stderr, "# oldDesc->numClusters: %u\n", oldDesc->numClusters); fprintf(stderr, "# clusterSize: "); int _sum = 0; - for (int i = 0; i < oldDesc->numClusters; i++) { + for (uint32_t i = 0; i < oldDesc->numClusters; i++) { _sum += clusterSize[i]; if ((i < _num_show) || (oldDesc->numClusters - i <= _num_show)) { fprintf(stderr, "%u, ", clusterSize[i]); @@ -6146,4 +6104,1193 @@ cuannStatus_t cuannIvfPqSearch( return CUANN_STATUS_SUCCESS; } +// +template +__device__ inline float ivfpq_compute_score( + uint32_t dimPq, + uint32_t iDataset, + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const smemLutDtype* preCompScores, // [dimPq, 1 << bitPq] + bool earlyStop, + float kth_score = FLT_MAX) +{ + float score = 0.0; + constexpr uint32_t bitT = sizeof(T) * 8; + const T* headPqDataset = (T*)(pqDataset + (uint64_t)iDataset * 
(dimPq * bitPq / 8)); + for (int j = 0; j < dimPq / vecLen; j += 1) { + T pqCode = headPqDataset[0]; + headPqDataset += 1; + uint32_t bitLeft = bitT; +#pragma unroll vecLen + for (int k = 0; k < vecLen; k += 1) { + uint8_t code = pqCode; + if (bitLeft > bitPq) { + // This condition is always true here (to make the compiler happy) + if constexpr (bitT > bitPq) { pqCode >>= bitPq; } + bitLeft -= bitPq; + } else { + if (k < vecLen - 1) { + pqCode = headPqDataset[0]; + headPqDataset += 1; + } + code |= (pqCode << bitLeft); + pqCode >>= (bitPq - bitLeft); + bitLeft += (bitT - bitPq); + } + code &= (1 << bitPq) - 1; + score += (float)preCompScores[code]; + preCompScores += (1 << bitPq); + + if (earlyStop && (vecLen > 8) && ((k % 8) == 0)) { + if (score > kth_score) { return FLT_MAX; } + } + } + if (earlyStop && (vecLen <= 8)) { + if (score > kth_score) { return FLT_MAX; } + } + } + return score; +} + +// +template +__device__ inline void warp_merge(K& key, bool acending = true, int group_size = 32) +{ + int lane_id = threadIdx.x % 32; + for (int mask = (group_size >> 1); mask > 0; mask >>= 1) { + bool direction = ((lane_id & mask) == 0); + K opp_key = __shfl_xor_sync(0xffffffff, key, mask); + if ((acending == direction) == (key > opp_key)) { key = opp_key; } + } +} + +// +template +__device__ inline void warp_merge(K& key, V& val, bool acending = true, int group_size = 32) +{ + int lane_id = threadIdx.x % 32; + for (int mask = (group_size >> 1); mask > 0; mask >>= 1) { + bool direction = ((lane_id & mask) == 0); + K opp_key = __shfl_xor_sync(0xffffffff, key, mask); + V opp_val = __shfl_xor_sync(0xffffffff, val, mask); + if ((acending == direction) == ((key > opp_key) || ((key == opp_key) && (val > opp_val)))) { + key = opp_key; + val = opp_val; + } + } +} + +// +template +__device__ inline void warp_sort(K& key, bool acending = true) +{ + int lane_id = threadIdx.x % 32; + for (int group_size = 2; group_size <= 32; group_size <<= 1) { + bool direction = ((lane_id & group_size) == 0); + if ((group_size == 32) && (!acending)) { direction = !direction; } + warp_merge(key, direction, group_size); + } +} + +// +template +__device__ inline void warp_sort(K& key, V& val, bool acending = true) +{ + int lane_id = threadIdx.x % 32; + for (int group_size = 2; group_size <= 32; group_size <<= 1) { + bool direction = ((lane_id & group_size) == 0); + if ((group_size == 32) && (!acending)) { direction = !direction; } + warp_merge(key, val, direction, group_size); + } +} + +// +template +__device__ inline void swap(T& val1, T& val2) +{ + T val0 = val1; + val1 = val2; + val2 = val0; +} + +// +template +__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) +{ + if ((key1 > key2) || ((key1 == key2) && (val1 > val2))) { + swap(key1, key2); + swap(val1, val2); + return true; + } + return false; +} + +// +template +__device__ inline bool swap_if_needed(K& key1, K& key2) +{ + if (key1 > key2) { + swap(key1, key2); + return true; + } + return false; +} + +// +template +__device__ inline T max_value_of(); +template <> +__device__ inline float max_value_of() +{ + return FLT_MAX; +} +template <> +__device__ inline uint32_t max_value_of() +{ + return ~0u; +} + +// +template +class BlockTopk { + public: + __device__ BlockTopk(uint32_t topk, K* ptr_kth_key) : _topk(topk), _lane_id(threadIdx.x % 32) + { +#pragma unroll + for (int i = 0; i < depth; i++) { + _key[i] = max_value_of(); + _val[i] = max_value_of(); + } + _nfill = 0; + _init_buf(); + _ptr_kth_key = ptr_kth_key; + if (_ptr_kth_key) { + _kth_key = 
_ptr_kth_key[0]; + } else { + _kth_key = max_value_of(); + } + // __syncthreads(); + } + + __device__ inline K key(int i) { return _key[i]; } + + __device__ inline V val(int i) { return _val[i]; } + + __device__ inline K kth_key() { return _kth_key; } + + __device__ void add(K key, V val) + { + uint32_t mask = __ballot_sync(0xffffffff, (key < _kth_key)); + if (mask == 0) { return; } + uint32_t nvalid = __popc(mask); + if (_buf_nvalid + nvalid > 32) { + _add(_buf_key, _buf_val); + _init_buf(); + if (_ptr_kth_key) { _kth_key = min(_kth_key, _ptr_kth_key[0]); } + } + _push_buf(key, val, mask, nvalid); + } + + __device__ void finalize() + { + if (_buf_nvalid > 0) { _add(_buf_key, _buf_val); } + _merge(); + } + + protected: + K _key[depth]; + V _val[depth]; + K* _ptr_kth_key; + K _kth_key; + uint32_t _nfill; // 0 <= _nfill <= depth + K _buf_key; + V _buf_val; + uint32_t _buf_nvalid; // 0 <= _buf_nvalid <= 32 + + const uint32_t _topk; + const uint32_t _lane_id; + + __device__ inline void _init_buf() + { + _buf_nvalid = 0; + _buf_key = max_value_of(); + _buf_val = max_value_of(); + } + + __device__ inline void _adjust_nfill() + { +#pragma unroll + for (int j = 1; j < depth; j++) { + if (_nfill == depth - j + 1) { + if (__shfl_sync(0xffffffff, _key[depth - j], 0) <= _kth_key) { return; } + _nfill = depth - j; + } + } + } + + __device__ inline void _push_buf(K key, V val, uint32_t mask, uint32_t nvalid) + { + int i = 0; + if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { + int j = _lane_id - _buf_nvalid; + while (j > 0) { + i = __ffs(mask) - 1; + mask ^= (0x1u << i); + j -= 1; + } + i = __ffs(mask) - 1; + } + K temp_key = __shfl_sync(0xffffffff, key, i); + K temp_val = __shfl_sync(0xffffffff, val, i); + if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { + _buf_key = temp_key; + _buf_val = temp_val; + } + _buf_nvalid += nvalid; + } + + __device__ inline void _add(K key, V val) + { + if (_nfill == 0) { + warp_sort(key, val); + _key[0] = key; + _val[0] = val; + } else if (_nfill == 1) { + warp_sort(key, val, false); + swap_if_needed(_key[0], key, _val[0], val); + if (depth > 1) { + _key[1] = key; + _val[1] = val; + warp_merge(_key[1], _val[1]); + } + warp_merge(_key[0], _val[0]); + } else if ((depth >= 2) && (_nfill == 2)) { + warp_sort(key, val, false); + swap_if_needed(_key[1], key, _val[1], val); + if (depth > 2) { + _key[2] = key; + _val[2] = val; + warp_merge(_key[2], _val[2]); + } + warp_merge(_key[1], _val[1], false); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if ((depth >= 3) && (_nfill == 3)) { + warp_sort(key, val, false); + swap_if_needed(_key[2], key, _val[2], val); + if (depth > 3) { + _key[3] = key; + _val[3] = val; + warp_merge(_key[3], _val[3]); + } + warp_merge(_key[2], _val[2], false); + swap_if_needed(_key[1], _key[2], _val[1], _val[2]); + warp_merge(_key[2], _val[2]); + warp_merge(_key[1], _val[1], false); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if ((depth >= 4) && (_nfill == 4)) { + warp_sort(key, val, false); + swap_if_needed(_key[3], key, _val[3], val); + warp_merge(_key[3], _val[3], false); + swap_if_needed(_key[2], _key[3], _val[2], _val[3]); + warp_merge(_key[3], _val[3]); + warp_merge(_key[2], _val[2], false); + swap_if_needed(_key[1], _key[2], _val[1], _val[2]); + warp_merge(_key[2], _val[2]); + warp_merge(_key[1], _val[1], false); + swap_if_needed(_key[0], _key[1], 
_val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } + _nfill = min(_nfill + 1, depth); + if (_nfill == depth) { + _kth_key = + min(_kth_key, __shfl_sync(0xffffffff, _key[depth - 1], _topk - 1 - (depth - 1) * 32)); + } + } + + __device__ inline void _merge() + { + uint32_t warp_id = threadIdx.x / 32; + uint32_t num_warps = blockDim.x / 32; + K* smem_key = smemArray; + V* smem_val = (V*)(smem_key + (blockDim.x / 2) * depth); + for (int j = num_warps / 2; j > 0; j /= 2) { + __syncthreads(); + if ((j <= warp_id) && (warp_id < (j * 2))) { + uint32_t opp_tid = threadIdx.x - (j * 32); + smem_key[opp_tid] = _key[0]; + smem_val[opp_tid] = _val[0]; + if (depth >= 2) { + smem_key[opp_tid + (j * 32)] = _key[1]; + smem_val[opp_tid + (j * 32)] = _val[1]; + } + if (depth >= 3) { + smem_key[opp_tid + (j * 32) * 2] = _key[2]; + smem_val[opp_tid + (j * 32) * 2] = _val[2]; + } + if (depth >= 4) { + smem_key[opp_tid + (j * 32) * 3] = _key[3]; + smem_val[opp_tid + (j * 32) * 3] = _val[3]; + } + } + __syncthreads(); + if (warp_id < j) { + K key; + V val; + if (depth == 1) { + key = smem_key[threadIdx.x ^ 31]; + val = smem_val[threadIdx.x ^ 31]; + swap_if_needed(_key[0], key, _val[0], val); + + warp_merge(_key[0], _val[0]); + } else if (depth == 2) { + key = smem_key[threadIdx.x ^ 31 + (j * 32)]; + val = smem_val[threadIdx.x ^ 31 + (j * 32)]; + swap_if_needed(_key[0], key, _val[0], val); + key = smem_key[threadIdx.x ^ 31]; + val = smem_val[threadIdx.x ^ 31]; + swap_if_needed(_key[1], key, _val[1], val); + + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if (depth == 3) { + key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; + val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; + swap_if_needed(_key[1], key, _val[1], val); + key = smem_key[threadIdx.x ^ 31 + (j * 32)]; + val = smem_val[threadIdx.x ^ 31 + (j * 32)]; + swap_if_needed(_key[2], key, _val[2], val); + K _key_3_ = smem_key[threadIdx.x ^ 31]; + V _val_3_ = smem_val[threadIdx.x ^ 31]; + + swap_if_needed(_key[0], _key[2], _val[0], _val[2]); + swap_if_needed(_key[1], _key_3_, _val[1], _val_3_); + swap_if_needed(_key[2], _key_3_, _val[2], _val_3_); + warp_merge(_key[2], _val[2]); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if (depth == 4) { + key = smem_key[threadIdx.x ^ 31 + (j * 32) * 3]; + val = smem_val[threadIdx.x ^ 31 + (j * 32) * 3]; + swap_if_needed(_key[0], key, _val[0], val); + key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; + val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; + swap_if_needed(_key[1], key, _val[1], val); + key = smem_key[threadIdx.x ^ 31 + (j * 32)]; + val = smem_val[threadIdx.x ^ 31 + (j * 32)]; + swap_if_needed(_key[2], key, _val[2], val); + key = smem_key[threadIdx.x ^ 31]; + val = smem_val[threadIdx.x ^ 31]; + swap_if_needed(_key[3], key, _val[3], val); + + swap_if_needed(_key[0], _key[2], _val[0], _val[2]); + swap_if_needed(_key[1], _key[3], _val[1], _val[3]); + swap_if_needed(_key[2], _key[3], _val[2], _val[3]); + warp_merge(_key[3], _val[3]); + warp_merge(_key[2], _val[2]); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } + } + } + } +}; + +// +template +__device__ inline void update_approx_global_score(uint32_t topk, + K* my_score, + K* approx_global_score) +{ + if (!__any_sync(0xffffffff, (my_score[0] < approx_global_score[topk - 1]))) { return; } + if (topk <= 32) 
{ + K score = max_value_of(); + if (threadIdx.x < topk) { score = approx_global_score[threadIdx.x]; } + warp_sort(score, false); + swap_if_needed(my_score[0], score); + + warp_merge(my_score[0]); + if (threadIdx.x < topk) { atomicMin(approx_global_score + threadIdx.x, my_score[0]); } + } else if (topk <= 64) { + K score = max_value_of(); + if (threadIdx.x + 32 < topk) { score = approx_global_score[threadIdx.x + 32]; } + warp_sort(score, false); + swap_if_needed(my_score[0], score); + score = approx_global_score[threadIdx.x]; + warp_sort(score, false); + swap_if_needed(my_score[1], score); + + swap_if_needed(my_score[0], my_score[1]); + warp_merge(my_score[1]); + warp_merge(my_score[0]); + + atomicMin(approx_global_score + threadIdx.x, my_score[0]); + if (threadIdx.x + 32 < topk) { atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); } + } else if (topk <= 96) { + K score = max_value_of(); + if (threadIdx.x + 64 < topk) { score = approx_global_score[threadIdx.x + 64]; } + warp_sort(score, false); + swap_if_needed(my_score[1], score); + score = approx_global_score[threadIdx.x + 32]; + warp_sort(score, false); + swap_if_needed(my_score[2], score); + score = approx_global_score[threadIdx.x]; + warp_sort(score, false); + K my_score_3_ = score; + + swap_if_needed(my_score[0], my_score[2]); + swap_if_needed(my_score[1], my_score_3_); + swap_if_needed(my_score[2], my_score_3_); + warp_merge(my_score[2]); + swap_if_needed(my_score[0], my_score[1]); + warp_merge(my_score[1]); + warp_merge(my_score[0]); + + atomicMin(approx_global_score + threadIdx.x, my_score[0]); + atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); + if (threadIdx.x + 64 < topk) { atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); } + } else if (topk <= 128) { + K score = max_value_of(); + if (threadIdx.x + 96 < topk) { score = approx_global_score[threadIdx.x + 96]; } + warp_sort(score, false); + swap_if_needed(my_score[0], score); + score = approx_global_score[threadIdx.x + 64]; + warp_sort(score, false); + swap_if_needed(my_score[1], score); + score = approx_global_score[threadIdx.x + 32]; + warp_sort(score, false); + swap_if_needed(my_score[2], score); + score = approx_global_score[threadIdx.x]; + warp_sort(score, false); + swap_if_needed(my_score[3], score); + + swap_if_needed(my_score[0], my_score[2]); + swap_if_needed(my_score[1], my_score[3]); + swap_if_needed(my_score[2], my_score[3]); + warp_merge(my_score[3]); + warp_merge(my_score[2]); + swap_if_needed(my_score[0], my_score[1]); + warp_merge(my_score[1]); + warp_merge(my_score[0]); + + atomicMin(approx_global_score + threadIdx.x, my_score[0]); + atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); + atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); + if (threadIdx.x + 96 < topk) { atomicMin(approx_global_score + threadIdx.x + 96, my_score[3]); } + } +} + +// +template +__device__ inline outDtype get_out_score(float score, cuannSimilarity_t similarity) +{ + if (similarity == CUANN_SIMILARITY_INNER) { score = score / 2.0 - 1.0; } + if (sizeof(outDtype) == 2) { score = min(score, FP16_MAX); } + return (outDtype)score; +} + +// +// (*) Restrict the peak GPU occupancy up-to 50% by "__launch_bounds__(1024, 1)", +// as there were cases where performance dropped by a factor of two or more on V100 +// when the peak GPU occupancy was set to more than 50%. 
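+//     (On V100-class GPUs an SM holds at most 2048 resident threads; with 1024-thread CTAs
+//     and the per-CTA register budget implied by minBlocksPerMultiprocessor = 1, typically
+//     only one CTA is resident per SM, i.e. at most 1024 / 2048 = 50% occupancy.)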
+// +template +__launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( + uint32_t numDataset, + uint32_t dimDataset, + uint32_t numProbes, + uint32_t dimPq, + uint32_t sizeBatch, + uint32_t maxSamples, + cuannSimilarity_t similarity, + cuannPqCenter_t typePqCenter, + uint32_t topk, + const float* clusterCenters, // [numClusters, dimDataset,] + const float* pqCenters, // [dimPq, 1 << bitPq, lenPq,], or + // [numClusetrs, 1 << bitPq, lenPq,] + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const uint32_t* clusterIndexPtr, // [numClusters + 1,] + const uint32_t* _clusterLabels, // [sizeBatch, numProbes,] + const uint32_t* _chunkIndexPtr, // [sizeBatch, numProbes,] + const float* _query, // [sizeBatch, dimDataset,] + const uint32_t* indexList, // [sizeBatch * numProbes] + float* _preCompScores, // [...] + float* _topkScores, // [sizeBatch, topk] + outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] + uint32_t* _topkIndex // [sizeBatch, numProbes, topk] +) +{ + const uint32_t lenPq = dimDataset / dimPq; + float* smem = smemArray; + + smemLutDtype* preCompScores = (smemLutDtype*)smem; + float* baseDiff = NULL; + if (preCompBaseDiff) { baseDiff = (float*)(preCompScores + (dimPq << bitPq)); } + bool manageLocalTopk = false; + if (_topkIndex != NULL) { manageLocalTopk = true; } + + uint32_t iBatch; + uint32_t iProbe; + if (indexList == NULL) { + // iBatch = blockIdx.x / numProbes; + // iProbe = blockIdx.x % numProbes; + iBatch = blockIdx.x % sizeBatch; + iProbe = blockIdx.x / sizeBatch; + } else { + iBatch = indexList[blockIdx.x] / numProbes; + iProbe = indexList[blockIdx.x] % numProbes; + } + if (iBatch >= sizeBatch || iProbe >= numProbes) return; + + const uint32_t* clusterLabels = _clusterLabels + (numProbes * iBatch); + const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); + const float* query = _query + (dimDataset * iBatch); + outDtype* output; + uint32_t* topkIndex = NULL; + float* approx_global_score = NULL; + if (manageLocalTopk) { + // Store topk calculated distances to output (and its indices to topkIndex) + output = _output + (topk * (iProbe + (numProbes * iBatch))); + topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); + approx_global_score = _topkScores + (topk * iBatch); + } else { + // Store all calculated distances to output + output = _output + (maxSamples * iBatch); + } + uint32_t label = clusterLabels[iProbe]; + const float* myClusterCenter = clusterCenters + (dimDataset * label); + const float* myPqCenters; + if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + myPqCenters = pqCenters; + } else { + myPqCenters = pqCenters + (lenPq << bitPq) * label; + } + + if (preCompBaseDiff) { + // Reduce computational complexity by pre-computing the difference + // between the cluster centroid and the query. 
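+    // Each lookup-table entry computed below is || (query - cluster_center) - pq_center ||^2
+    // for one sub-space code. The residual (query - cluster_center) does not depend on the
+    // code, so it is computed once per probed cluster here and reused for all table entries.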
+ for (uint32_t i = threadIdx.x; i < dimDataset; i += blockDim.x) { + baseDiff[i] = query[i] - myClusterCenter[i]; + } + __syncthreads(); + } + + // Create a lookup table + for (uint32_t i = threadIdx.x; i < (dimPq << bitPq); i += blockDim.x) { + uint32_t iPq = i >> bitPq; + uint32_t iCode = i & ((1 << bitPq) - 1); + float score = 0.0; + for (uint32_t j = 0; j < lenPq; j++) { + uint32_t k = j + (lenPq * iPq); + float diff; + if (preCompBaseDiff) { + diff = baseDiff[k]; + } else { + diff = query[k] - myClusterCenter[k]; + } + if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + diff -= myPqCenters[j + (lenPq * i)]; + } else { + diff -= myPqCenters[j + (lenPq * iCode)]; + } + score += diff * diff; + } + preCompScores[i] = (smemLutDtype)score; + } + + uint32_t iSampleBase = 0; + if (iProbe > 0) { iSampleBase = chunkIndexPtr[iProbe - 1]; } + uint32_t nSamples = chunkIndexPtr[iProbe] - iSampleBase; + uint32_t nSamples32 = nSamples; + if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } + uint32_t iDatasetBase = clusterIndexPtr[label]; + + BlockTopk block_topk( + topk, manageLocalTopk ? approx_global_score + topk - 1 : NULL); + __syncthreads(); + + // Compute a distance for each sample + for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { + float score = FLT_MAX; + if (i < nSamples) { + score = ivfpq_compute_score( + dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); + } + if (!manageLocalTopk) { + if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, similarity); } + } else { + uint32_t val = i; + block_topk.add(score, val); + } + } + if (!manageLocalTopk) { return; } + block_topk.finalize(); + + // Output topk score and index + uint32_t warp_id = threadIdx.x / 32; + if (warp_id == 0) { + for (int j = 0; j < depth; j++) { + if (threadIdx.x + (32 * j) < topk) { + output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), similarity); + topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; + } + } + } + + // Approximate update of global topk entries + if (warp_id == 0) { + float my_score[depth]; + for (int j = 0; j < depth; j++) { + my_score[j] = block_topk.key(j); + } + update_approx_global_score(topk, my_score, approx_global_score); + } +} + +// +template +__launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( + uint32_t numDataset, + uint32_t dimDataset, + uint32_t numProbes, + uint32_t dimPq, + uint32_t sizeBatch, + uint32_t maxSamples, + cuannSimilarity_t similarity, + cuannPqCenter_t typePqCenter, + uint32_t topk, + const float* clusterCenters, // [numClusters, dimDataset,] + const float* pqCenters, // [dimPq, 1 << bitPq, lenPq,], or + // [numClusetrs, 1 << bitPq, lenPq,] + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const uint32_t* clusterIndexPtr, // [numClusters + 1,] + const uint32_t* _clusterLabels, // [sizeBatch, numProbes,] + const uint32_t* _chunkIndexPtr, // [sizeBatch, numProbes,] + const float* _query, // [sizeBatch, dimDataset,] + const uint32_t* indexList, // [sizeBatch * numProbes] + float* _preCompScores, // [..., dimPq << bitPq,] + float* _topkScores, // [sizeBatch, topk] + outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] + uint32_t* _topkIndex // [sizeBatch, numProbes, topk] +) +{ + const uint32_t lenPq = dimDataset / dimPq; + + float* preCompScores = _preCompScores + ((dimPq << bitPq) * blockIdx.x); + float* baseDiff = NULL; + if (preCompBaseDiff) { baseDiff = (float*)smemArray; } + 
bool manageLocalTopk = false; + if (_topkIndex != NULL) { manageLocalTopk = true; } + + for (int ib = blockIdx.x; ib < sizeBatch * numProbes; ib += gridDim.x) { + uint32_t iBatch; + uint32_t iProbe; + if (indexList == NULL) { + // iBatch = ib / numProbes; + // iProbe = ib % numProbes; + iBatch = ib % sizeBatch; + iProbe = ib / sizeBatch; + } else { + iBatch = indexList[ib] / numProbes; + iProbe = indexList[ib] % numProbes; + } + + const uint32_t* clusterLabels = _clusterLabels + (numProbes * iBatch); + const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); + const float* query = _query + (dimDataset * iBatch); + outDtype* output; + uint32_t* topkIndex = NULL; + float* approx_global_score = NULL; + if (manageLocalTopk) { + // Store topk calculated distances to output (and its indices to topkIndex) + output = _output + (topk * (iProbe + (numProbes * iBatch))); + topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); + approx_global_score = _topkScores + (topk * iBatch); + } else { + // Store all calculated distances to output + output = _output + (maxSamples * iBatch); + } + uint32_t label = clusterLabels[iProbe]; + const float* myClusterCenter = clusterCenters + (dimDataset * label); + const float* myPqCenters; + if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + myPqCenters = pqCenters; + } else { + myPqCenters = pqCenters + (lenPq << bitPq) * label; + } + + if (preCompBaseDiff) { + // Reduce computational complexity by pre-computing the difference + // between the cluster centroid and the query. + for (uint32_t i = threadIdx.x; i < dimDataset; i += blockDim.x) { + baseDiff[i] = query[i] - myClusterCenter[i]; + } + __syncthreads(); + } + + // Create a lookup table + for (uint32_t i = threadIdx.x; i < (dimPq << bitPq); i += blockDim.x) { + uint32_t iPq = i >> bitPq; + uint32_t iCode = i & ((1 << bitPq) - 1); + float score = 0.0; + for (uint32_t j = 0; j < lenPq; j++) { + uint32_t k = j + (lenPq * iPq); + float diff; + if (preCompBaseDiff) { + diff = baseDiff[k]; + } else { + diff = query[k] - myClusterCenter[k]; + } + if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { + diff -= myPqCenters[j + (lenPq * i)]; + } else { + diff -= myPqCenters[j + (lenPq * iCode)]; + } + score += diff * diff; + } + preCompScores[i] = score; + } + + uint32_t iSampleBase = 0; + if (iProbe > 0) { iSampleBase = chunkIndexPtr[iProbe - 1]; } + uint32_t nSamples = chunkIndexPtr[iProbe] - iSampleBase; + uint32_t nSamples32 = nSamples; + if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } + uint32_t iDatasetBase = clusterIndexPtr[label]; + + BlockTopk block_topk( + topk, manageLocalTopk ? approx_global_score + topk - 1 : NULL); + __syncthreads(); + + // Compute a distance for each sample + for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { + float score = FLT_MAX; + if (i < nSamples) { + score = ivfpq_compute_score( + dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); + } + if (!manageLocalTopk) { + if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, similarity); } + } else { + uint32_t val = i; + block_topk.add(score, val); + } + } + __syncthreads(); + if (!manageLocalTopk) { + continue; // for (int ib ...) 
+ } + block_topk.finalize(); + + // Output topk score and index + uint32_t warp_id = threadIdx.x / 32; + if (warp_id == 0) { + for (int j = 0; j < depth; j++) { + if (threadIdx.x + (32 * j) < topk) { + output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), similarity); + topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; + } + } + } + + // Approximate update of global topk entries + if (warp_id == 0) { + float my_score[depth]; + for (int j = 0; j < depth; j++) { + my_score[j] = block_topk.key(j); + } + update_approx_global_score(topk, my_score, approx_global_score); + } + __syncthreads(); + } +} + +// search +template +void ivfpq_search(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + uint32_t numQueries, + const float* clusterCenters, // [numDataset, dimRotDataset] + const float* pqCenters, // [dimPq, 1 << desc->bitPq, lenPq] + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* indexPtr, // [numClusters + 1] + const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] + const float* query, // [numQueries, dimRotDataset] + uint64_t* topkNeighbors, // [numQueries, topK] + float* topkDistances, // [numQueries, topK] + void* workspace) +{ + assert(numQueries <= desc->maxBatchSize); + + uint32_t* clusterLabelsOut; // [maxBatchSize, numProbes] + uint32_t* indexList; // [maxBatchSize * numProbes] + uint32_t* indexListSorted; // [maxBatchSize * numProbes] + uint32_t* numSamples; // [maxBatchSize,] + void* cubWorkspace; // ... + uint32_t* chunkIndexPtr; // [maxBatchSize, numProbes] + uint32_t* topkSids; // [maxBatchsize, topk] + scoreDtype* similarity; // [maxBatchSize, maxSamples] or + // [maxBatchSize, numProbes, topk] + uint32_t* simTopkIndex; // [maxBatchSize, numProbes, topk] + float* topkScores; // [maxBatchSize, topk] + float* preCompScores = NULL; + void* topkWorkspace; + + cudaError_t cudaError; + + clusterLabelsOut = (uint32_t*)workspace; + indexList = (uint32_t*)((uint8_t*)clusterLabelsOut + + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + indexListSorted = + (uint32_t*)((uint8_t*)indexList + + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + numSamples = (uint32_t*)((uint8_t*)indexListSorted + + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + cubWorkspace = + (void*)((uint8_t*)numSamples + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize)); + chunkIndexPtr = (uint32_t*)((uint8_t*)cubWorkspace + desc->sizeCubWorkspace); + topkSids = (uint32_t*)((uint8_t*)chunkIndexPtr + + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + similarity = (scoreDtype*)((uint8_t*)topkSids + + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->topK)); + if (manage_local_topk(desc)) { + topkScores = + (float*)((uint8_t*)similarity + _cuann_aligned(sizeof(scoreDtype) * desc->maxBatchSize * + desc->numProbes * desc->topK)); + simTopkIndex = (uint32_t*)((uint8_t*)topkScores + + _cuann_aligned(sizeof(float) * desc->maxBatchSize * desc->topK)); + preCompScores = + (float*)((uint8_t*)simTopkIndex + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * + desc->numProbes * desc->topK)); + } else { + topkScores = NULL; + simTopkIndex = NULL; + preCompScores = + (float*)((uint8_t*)similarity + + _cuann_aligned(sizeof(scoreDtype) * desc->maxBatchSize * desc->maxSamples)); + } + topkWorkspace = (void*)((uint8_t*)preCompScores + + _cuann_aligned(sizeof(float) * 
(handle->deviceProp).multiProcessorCount * + desc->dimPq * (1 << desc->bitPq))); + + // + if (manage_local_topk(desc)) { + dim3 iksThreads(128, 1, 1); + dim3 iksBlocks(((numQueries * desc->topK) + iksThreads.x - 1) / iksThreads.x, 1, 1); + ivfpq_init_topkScores<<stream>>>( + topkScores, FLT_MAX, numQueries * desc->topK); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + exit(-1); + } +#endif + } + + // + dim3 mcThreads(1024, 1, 1); // DO NOT CHANGE + dim3 mcBlocks(numQueries, 1, 1); + ivfpq_make_chunk_index_ptr<<stream>>>( + desc->numProbes, numQueries, indexPtr, clusterLabelsToProbe, chunkIndexPtr, numSamples); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + exit(-1); + } +#endif + + if (numQueries * desc->numProbes > 256) { + // Sorting index by cluster number (label). + // The goal is to incrase the L2 cache hit rate to read the vectors + // of a cluster by processing the cluster at the same time as much as + // possible. + dim3 psThreads(128, 1, 1); + dim3 psBlocks((numQueries * desc->numProbes + psThreads.x - 1) / psThreads.x, 1, 1); + ivfpq_prep_sort<<stream>>>(numQueries * desc->numProbes, + indexList); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + exit(-1); + } +#endif + + int begin_bit = 0; + int end_bit = sizeof(uint32_t) * 8; + cub::DeviceRadixSort::SortPairs(cubWorkspace, + desc->sizeCubWorkspace, + clusterLabelsToProbe, + clusterLabelsOut, + indexList, + indexListSorted, + numQueries * desc->numProbes, + begin_bit, + end_bit, + handle->stream); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + exit(-1); + } + if (0) { + for (uint32_t i = 0; i < numQueries * desc->numProbes; i++) { + fprintf(stderr, "# i:%u, index:%d, label:%u\n", i, indexListSorted[i], clusterLabelsOut[i]); + } + } +#endif + } else { + indexListSorted = NULL; + } + + // Select a GPU kernel for distance calculation +#define SET_KERNEL1(B, V, T, D) \ + do { \ + assert((B * V) % (sizeof(T) * 8) == 0); \ + kernel_no_basediff = ivfpq_compute_similarity; \ + kernel_fast = ivfpq_compute_similarity; \ + kernel_no_smem_lut = ivfpq_compute_similarity_no_smem_lut; \ + } while (0) + +#define SET_KERNEL2(B, M, D) \ + do { \ + assert(desc->dimPq % M == 0); \ + if (desc->dimPq % (M * 8) == 0) { \ + SET_KERNEL1(B, (M * 8), uint64_t, D); \ + } else if (desc->dimPq % (M * 4) == 0) { \ + SET_KERNEL1(B, (M * 4), uint32_t, D); \ + } else if (desc->dimPq % (M * 2) == 0) { \ + SET_KERNEL1(B, (M * 2), uint16_t, D); \ + } else if (desc->dimPq % (M * 1) == 0) { \ + SET_KERNEL1(B, (M * 1), uint8_t, D); \ + } \ + } while (0) + +#define SET_KERNEL3(D) \ + do { \ + switch (desc->bitPq) { \ + case 4: SET_KERNEL2(4, 2, D); break; \ + case 5: SET_KERNEL2(5, 8, D); break; \ + case 6: SET_KERNEL2(6, 4, D); break; \ + case 7: SET_KERNEL2(7, 8, D); break; \ + case 8: SET_KERNEL2(8, 1, D); break; \ + } \ + } while (0) + + typedef void (*kernel_t)(uint32_t, + uint32_t, + uint32_t, + uint32_t, + uint32_t, + uint32_t, + cuannSimilarity_t, + cuannPqCenter_t, + uint32_t, + const float*, + const float*, + const uint8_t*, + const 
uint32_t*, + const uint32_t*, + const uint32_t*, + const float*, + const uint32_t*, + float*, + float*, + scoreDtype*, + uint32_t*); + kernel_t kernel_no_basediff; + kernel_t kernel_fast; + kernel_t kernel_no_smem_lut; + int depth = 1; + if (manage_local_topk(desc)) { depth = (desc->topK + 31) / 32; } + switch (depth) { + case 1: SET_KERNEL3(1); break; + case 2: SET_KERNEL3(2); break; + case 3: SET_KERNEL3(3); break; + case 4: SET_KERNEL3(4); break; + } + constexpr size_t thresholdSmem = 48 * 1024; + size_t sizeSmem = sizeof(smemLutDtype) * desc->dimPq * (1 << desc->bitPq); + size_t sizeSmemBaseDiff = sizeof(float) * desc->dimDataset; + + uint32_t numCTAs = numQueries * desc->numProbes; + int numThreads = 1024; + // desc->preferredThreadBlockSize == 0 means using auto thread block size calculation mode + if (desc->preferredThreadBlockSize == 0) { + constexpr int minThreads = 256; + while (numThreads > minThreads) { + if (numCTAs < + uint32_t((handle->deviceProp).multiProcessorCount * (1024 / (numThreads / 2)))) { + break; + } + if ((handle->deviceProp).sharedMemPerMultiprocessor * 2 / 3 < + sizeSmem * (1024 / (numThreads / 2))) { + break; + } + numThreads /= 2; + } + } else { + numThreads = desc->preferredThreadBlockSize; + } + // printf("# numThreads: %d\n", numThreads); + size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); + sizeSmem = max(sizeSmem, sizeSmemForLocalTopk); + + kernel_t kernel = kernel_no_basediff; + + bool kernel_no_basediff_available = true; + if (sizeSmem > thresholdSmem) { + cudaError = cudaFuncSetAttribute( + kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem); + if (cudaError != cudaSuccess) { + kernel_no_basediff_available = false; + + // Use "kernel_no_smem_lut" which just uses small amount of shared memory. 
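+      // The lookup table does not fit in shared memory (opting in to a larger dynamic
+      // allocation failed above), so each CTA keeps its table in global memory
+      // (_preCompScores, one slot per blockIdx.x), the grid is shrunk to one CTA per SM,
+      // and every CTA loops over multiple (query, probe) pairs.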
+ kernel = kernel_no_smem_lut; + numThreads = 1024; + size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); + sizeSmem = max(sizeSmemBaseDiff, sizeSmemForLocalTopk); + numCTAs = (handle->deviceProp).multiProcessorCount; + } + } + if (kernel_no_basediff_available) { + bool kernel_fast_available = true; + if (sizeSmem + sizeSmemBaseDiff > thresholdSmem) { + cudaError = cudaFuncSetAttribute( + kernel_fast, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem + sizeSmemBaseDiff); + if (cudaError != cudaSuccess) { kernel_fast_available = false; } + } +#if 0 + fprintf( stderr, + "# sizeSmem: %lu, sizeSmemBaseDiff: %lu, kernel_fast_available: %d\n", + sizeSmem, sizeSmemBaseDiff, kernel_fast_available ); +#endif + if (kernel_fast_available) { + int numBlocks_kernel_no_basediff = 0; + cudaError = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks_kernel_no_basediff, kernel_no_basediff, numThreads, sizeSmem); + // fprintf(stderr, "# numBlocks_kernel_no_basediff: %d\n", numBlocks_kernel_no_basediff); + if (cudaError != cudaSuccess) { + fprintf(stderr, "cudaOccupancyMaxActiveBlocksPerMultiprocessor() failed\n"); + exit(-1); + } + + int numBlocks_kernel_fast = 0; + cudaError = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks_kernel_fast, kernel_fast, numThreads, sizeSmem + sizeSmemBaseDiff); + // fprintf(stderr, "# numBlocks_kernel_fast: %d\n", numBlocks_kernel_fast); + if (cudaError != cudaSuccess) { + fprintf(stderr, "cudaOccupancyMaxActiveBlocksPerMultiprocessor() failed\n"); + exit(-1); + } + + // Use "kernel_fast" only if GPU occupancy does not drop + if (numBlocks_kernel_no_basediff == numBlocks_kernel_fast) { + kernel = kernel_fast; + sizeSmem += sizeSmemBaseDiff; + } + } + } + dim3 ctaThreads(numThreads, 1, 1); + dim3 ctaBlocks(numCTAs, 1, 1); + kernel<<stream>>>(desc->numDataset, + desc->dimRotDataset, + desc->numProbes, + desc->dimPq, + numQueries, + desc->maxSamples, + desc->similarity, + desc->typePqCenter, + desc->topK, + clusterCenters, + pqCenters, + pqDataset, + indexPtr, + clusterLabelsToProbe, + chunkIndexPtr, + query, + indexListSorted, + preCompScores, + topkScores, + (scoreDtype*)similarity, + simTopkIndex); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + exit(-1); + } +#endif + + // Select topk vectors for each query + if (simTopkIndex == NULL) { + _cuann_find_topk(handle, + desc->topK, + numQueries, + desc->maxSamples, + numSamples, + (scoreDtype*)similarity, + topkSids, + topkWorkspace); + } else { + _cuann_find_topk(handle, + desc->topK, + numQueries, + (desc->numProbes * desc->topK), + NULL, + (scoreDtype*)similarity, + topkSids, + topkWorkspace); + } +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + exit(-1); + } +#endif + + // + dim3 moThreads(128, 1, 1); + dim3 moBlocks((desc->topK + moThreads.x - 1) / moThreads.x, numQueries, 1); + ivfpq_make_outputs + <<stream>>>(desc->numProbes, + desc->topK, + desc->maxSamples, + numQueries, + indexPtr, + originalNumbers, + clusterLabelsToProbe, + chunkIndexPtr, + (scoreDtype*)similarity, + simTopkIndex, + topkSids, + topkNeighbors, + topkDistances); +#ifdef CUANN_DEBUG + cudaError = cudaDeviceSynchronize(); + if (cudaError != cudaSuccess) { + fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); + exit(-1); + 
} +#endif +} + } // namespace raft::spatial::knn::ivf_pq From 4f71b09304bc0870c3273900e4b954206dfde5c0 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 5 Aug 2022 11:14:39 +0200 Subject: [PATCH 003/140] Added tests (still failing) --- .../raft/spatial/knn/detail/ann_utils.cuh | 6 + cpp/include/raft/spatial/knn/ivf_pq.cuh | 781 +++++++++--------- cpp/test/CMakeLists.txt | 1 + cpp/test/spatial/ann_ivf_pq.cu | 442 ++++++++++ 4 files changed, 840 insertions(+), 390 deletions(-) create mode 100644 cpp/test/spatial/ann_ivf_pq.cu diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index e789bafde2..24075ace55 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -151,6 +151,7 @@ inline void memzero(T* ptr, size_t n_elems, rmm::cuda_stream_view stream) } } +namespace { __global__ void argmin_along_rows_kernel(uint32_t n_rows, uint32_t n_cols, const float* a, @@ -185,6 +186,7 @@ __global__ void argmin_along_rows_kernel(uint32_t n_rows, } if (threadIdx.x == 0) { out[i] = shm_ids[0]; } } +} // namespace /** * @brief Find index of the smallest element in each row. @@ -209,6 +211,7 @@ inline void argmin_along_rows( argmin_along_rows_kernel<<>>(n_rows, n_cols, a, out); } +namespace { __global__ void dots_along_rows_kernel(uint32_t n_rows, uint32_t n_cols, const float* a, float* out) { uint64_t i = threadIdx.y + (blockDim.y * blockIdx.x); @@ -226,6 +229,7 @@ __global__ void dots_along_rows_kernel(uint32_t n_rows, uint32_t n_cols, const f sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); if (threadIdx.x == 0) { out[i] = sqsum; } } +} // namespace /** * @brief Square sum of values in each row (row-major matrix). @@ -317,6 +321,7 @@ void accumulate_into_selected(size_t n_rows, } } +namespace { __global__ void normalize_rows_kernel(uint32_t n_rows, uint32_t n_cols, float* a) { uint64_t i = threadIdx.y + (blockDim.y * blockIdx.x); @@ -338,6 +343,7 @@ __global__ void normalize_rows_kernel(uint32_t n_rows, uint32_t n_cols, float* a a[j + n_cols * i] *= sqsum; } } +} // namespace /** * @brief Divide rows by their L2 norm (square root of sum of squares). 
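For reference, the squared-L2 path in _cuann_kmeans_predict_core above combines these helpers via the expansion ||x - c||^2 = ||x||^2 + ||c||^2 - 2 (x . c): dots_along_rows produces the squared norms, outer_add their pairwise sums, and the GEMM with alpha = -2, beta = 1 adds the cross term. Below is a minimal host-side sketch of the same computation for illustration only; the function name and the plain loops are not part of the patch.

// CPU reference for the distance expansion used on the GPU path (illustration only).
inline void l2_distances_reference(uint32_t numDataset, uint32_t numCenters, uint32_t dim,
                                   const float* dataset,   // [numDataset, dim]
                                   const float* centers,   // [numCenters, dim]
                                   float* distances)       // [numDataset, numCenters]
{
  for (uint32_t i = 0; i < numDataset; i++) {
    for (uint32_t j = 0; j < numCenters; j++) {
      float sqsum_x = 0.0f, sqsum_c = 0.0f, dot = 0.0f;
      for (uint32_t k = 0; k < dim; k++) {
        sqsum_x += dataset[k + dim * i] * dataset[k + dim * i];
        sqsum_c += centers[k + dim * j] * centers[k + dim * j];
        dot += dataset[k + dim * i] * centers[k + dim * j];
      }
      // dots_along_rows + outer_add give (sqsum_x + sqsum_c); the GEMM adds -2 * dot.
      distances[j + numCenters * i] = sqsum_x + sqsum_c - 2.0f * dot;
    }
  }
}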
diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index c00c99f813..b50d385c0f 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -209,7 +209,7 @@ struct cuannIvfPqIndexHeader { }; // -char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) +inline char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) { if (dtype == CUDA_R_32F) sprintf(string, "float (CUDA_R_32F)"); @@ -225,7 +225,7 @@ char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) } // -size_t _cuann_aligned(size_t size) +inline size_t _cuann_aligned(size_t size) { size_t unit = 128; if (size % unit) { size += unit - (size % unit); } @@ -233,7 +233,7 @@ size_t _cuann_aligned(size_t size) } // memset -void _cuann_memset(void* ptr, int value, size_t count) +inline void _cuann_memset(void* ptr, int value, size_t count) { cudaPointerAttributes attr; cudaPointerGetAttributes(&attr, ptr); @@ -304,10 +304,10 @@ __global__ void kern_argmin(uint32_t nRows, } // argmin along column -void _cuann_argmin(uint32_t nRows, - uint32_t nCols, - const float* a, // [nRows, nCols] - uint32_t* out // [nRows] +inline void _cuann_argmin(uint32_t nRows, + uint32_t nCols, + const float* a, // [nRows, nCols] + uint32_t* out // [nRows] ) { uint32_t nThreads = 1024; @@ -337,13 +337,13 @@ __global__ void kern_copy(uint32_t nRows, // copy template -void _cuann_copy(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, ldDst] - uint32_t ldDst, - D divisor) +inline void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D divisor) { uint32_t nThreads = 128; uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; @@ -381,12 +381,12 @@ template void _cuann_copy(uint32_t nRows, // copy_CPU template -void _cuann_copy_CPU(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, ldDst] - uint32_t ldDst) +inline void _cuann_copy_CPU(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst) { for (uint32_t ir = 0; ir < nRows; ir++) { for (uint32_t ic = 0; ic < nCols; ic++) { @@ -422,15 +422,15 @@ __global__ void kern_copy_fill(uint32_t nRows, // copy_fill template -void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, ldDst] - uint32_t ldDst, - D fillValue, - D divisor, - cudaStream_t stream) +inline void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D fillValue, + D divisor, + cudaStream_t stream) { assert(ldSrc >= nCols); assert(ldDst >= nCols); @@ -489,14 +489,14 @@ __global__ void kern_copy_with_list(uint32_t nRows, // copy with row list template -void _cuann_copy_with_list(uint32_t nRows, - uint32_t nCols, - const T* src, // [..., ldSrc] - const uint32_t* rowList, // [nRows,] - uint32_t ldSrc, - float* dst, // [nRows, ldDst] - uint32_t ldDst, - float divisor = 1.0f) +inline void _cuann_copy_with_list(uint32_t nRows, + uint32_t nCols, + const T* src, // [..., ldSrc] + const uint32_t* rowList, // [nRows,] + uint32_t ldSrc, + float* dst, // [nRows, ldDst] + uint32_t ldDst, + float divisor = 1.0f) { cudaPointerAttributes attr; cudaPointerGetAttributes(&attr, src); @@ -556,11 +556,11 @@ 
__global__ void kern_a_me_b(uint32_t nRows, } // a -= b -void _cuann_a_me_b(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - uint32_t ldA, - float* b // [nCols] +inline void _cuann_a_me_b(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + uint32_t ldA, + float* b // [nCols] ) { uint32_t nThreads = 128; @@ -590,14 +590,14 @@ __global__ void kern_accumulate_with_label(uint32_t nRowsOutput, // accumulate template -void _cuann_accumulate_with_label(uint32_t nRowsOutput, - uint32_t nCols, - float* output, // [nRowsOutput, nCols,] - uint32_t* count, // [nRowsOutput,] - uint32_t nRowsInput, - const T* input, // [nRowsInput, nCols,] - const uint32_t* label, // [nRowsInput,] - float divisor = 1.0) +inline void _cuann_accumulate_with_label(uint32_t nRowsOutput, + uint32_t nCols, + float* output, // [nRowsOutput, nCols,] + uint32_t* count, // [nRowsOutput,] + uint32_t nRowsInput, + const T* input, // [nRowsInput, nCols,] + const uint32_t* label, // [nRowsInput,] + float divisor = 1.0) { bool useGPU = 1; cudaPointerAttributes attr; @@ -655,10 +655,10 @@ __global__ void kern_normalize(uint32_t nRows, } // normalize -void _cuann_normalize(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - const uint32_t* numSamples = nullptr // [nRows,] +inline void _cuann_normalize(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples = nullptr // [nRows,] ) { dim3 threads(32, 4, 1); // DO NOT CHANGE @@ -681,10 +681,10 @@ __global__ void kern_divide(uint32_t nRows, } // divide -void _cuann_divide(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - const uint32_t* numSamples // [nRows,] +inline void _cuann_divide(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples // [nRows,] ) { dim3 threads(128, 1, 1); @@ -716,16 +716,16 @@ __global__ void kern_transpose_copy_3d(uint32_t num0, // transpose_copy_3d template -void _cuann_transpose_copy_3d(uint32_t num0, - uint32_t num1, - uint32_t num2, - D* dst, // [num2, ld1, ld0] - uint32_t ld0, - uint32_t ld1, - const S* src, // [...] - uint32_t stride0, - uint32_t stride1, - uint32_t stride2) +inline void _cuann_transpose_copy_3d(uint32_t num0, + uint32_t num1, + uint32_t num2, + D* dst, // [num2, ld1, ld0] + uint32_t ld0, + uint32_t ld1, + const S* src, // [...] 
+ uint32_t stride0, + uint32_t stride1, + uint32_t stride2) { uint32_t nThreads = 128; uint32_t nBlocks = ((num0 * num1 * num2) + nThreads - 1) / nThreads; @@ -755,7 +755,7 @@ __global__ void kern_axpy(int num, T alpha, const T* x, T* y) // template -void _cuann_axpy(int num, T alpha, const T* x, T* y) +inline void _cuann_axpy(int num, T alpha, const T* x, T* y) { uint32_t nThreads = 128; uint32_t nBlocks = (num + nThreads - 1) / nThreads; @@ -823,7 +823,7 @@ T** _cuann_multi_device_malloc(int numDevices, // multi_device_free template -void _cuann_multi_device_free(T** arrays, int numDevices) +inline void _cuann_multi_device_free(T** arrays, int numDevices) { for (int devId = 0; devId < numDevices; devId++) { cudaFree(arrays[devId]); @@ -856,16 +856,16 @@ template void _cuann_multi_device_free(uint8_t** arrays, int numDevices */ // update kmeans centers -void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, - uint32_t* clusterSize, // [numCenters] - float* accumulatedCenters) +inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + uint32_t* clusterSize, // [numCenters] + float* accumulatedCenters) { if (accumulatedCenters == NULL) { // accumulate @@ -928,15 +928,15 @@ static cudaStream_t _cuann_set_cublas_stream(cublasHandle_t cublasHandle, cudaSt } // predict label of dataset -void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle, - const float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const float* dataset, // [numDataset, dimCenters] - uint32_t numDataset, - uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, - float* workspace) +inline void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle, + const float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const float* dataset, // [numDataset, dimCenters] + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + float* workspace) { cublasStatus_t cublasError; const uint32_t dimDataset = dimCenters; @@ -1010,9 +1010,9 @@ uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDatase } // -size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, - uint32_t dimCenters, - uint32_t numDataset) +inline size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, + uint32_t dimCenters, + uint32_t numDataset) { uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); size_t size = 0; @@ -1026,20 +1026,20 @@ size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, } // predict label of dataset -void _cuann_kmeans_predict(cublasHandle_t cublasHandle, - float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, - bool isCenterSet, - void* _workspace, - float* tempCenters, // [numCenters, dimCenters] - uint32_t* clusterSize, // [numCenters,] - bool updateCenter) +inline void 
_cuann_kmeans_predict(cublasHandle_t cublasHandle, + float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + bool isCenterSet, + void* _workspace, + float* tempCenters, // [numCenters, dimCenters] + uint32_t* clusterSize, // [numCenters,] + bool updateCenter) { if (!isCenterSet) { // If centers are not set, the labels will be determined randomly. @@ -1198,19 +1198,19 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle, // // predict label of dataset with multiple devices // -void _cuann_kmeans_predict_MP(int numDevices, - cublasHandle_t* cublasHandles, // [numDevices] - float* clusterCenters, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, - bool isCenterSet, - uint32_t* clusterSize, // [numCenters] - bool updateCenter // If true, cluster Centers will be updated. +inline void _cuann_kmeans_predict_MP(int numDevices, + cublasHandle_t* cublasHandles, // [numDevices] + float* clusterCenters, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity, + bool isCenterSet, + uint32_t* clusterSize, // [numCenters] + bool updateCenter // If true, cluster Centers will be updated. ) { // [numDevices][numCenters, dimCenters] @@ -1305,14 +1305,14 @@ void _cuann_kmeans_predict_MP(int numDevices, // predict labe of dataset (naive CPU version). // (*) available only for prediction, but not for training. 
-void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity) +inline void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + cuannSimilarity_t similarity) { float multiplier = 1.0; if (dtype == CUDA_R_8U) { @@ -2544,11 +2544,11 @@ __global__ void _sort_topk_prep(uint32_t sizeBatch, } // -size_t _cuann_find_topk_bufferSize(cuannHandle_t handle, - uint32_t topK, - uint32_t sizeBatch, - uint32_t maxSamples, - cudaDataType_t sampleDtype = CUDA_R_32F) +inline size_t _cuann_find_topk_bufferSize(cuannHandle_t handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + cudaDataType_t sampleDtype = CUDA_R_32F) { constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; @@ -2609,15 +2609,15 @@ int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH) } // -void _cuann_find_topk(cuannHandle_t handle, - uint32_t topK, - uint32_t sizeBatch, - uint32_t maxSamples, - uint32_t* numSamples, // [sizeBatch,] - const float* samples, // [sizeBatch, maxSamples,] - uint32_t* labels, // [sizeBatch, topK,] - void* workspace, - bool sort = false) +inline void _cuann_find_topk(cuannHandle_t handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + uint32_t* numSamples, // [sizeBatch,] + const float* samples, // [sizeBatch, maxSamples,] + uint32_t* labels, // [sizeBatch, topK,] + void* workspace, + bool sort = false) { constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; @@ -2740,15 +2740,15 @@ void _cuann_find_topk(cuannHandle_t handle, } // -void _cuann_find_topk(cuannHandle_t handle, - uint32_t topK, - uint32_t sizeBatch, - uint32_t maxSamples, - uint32_t* numSamples, // [sizeBatch,] - const half* samples, // [sizeBatch, maxSamples,] - uint32_t* labels, // [sizeBatch, topK,] - void* workspace, - bool sort = false) +inline void _cuann_find_topk(cuannHandle_t handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + uint32_t* numSamples, // [sizeBatch,] + const half* samples, // [sizeBatch, maxSamples,] + uint32_t* labels, // [sizeBatch, topK,] + void* workspace, + bool sort = false) { constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; @@ -2837,35 +2837,35 @@ void _cuann_find_topk(cuannHandle_t handle, */ // -size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc); +inline size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc); // search template -void ivfpq_search(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - uint32_t numQueries, - const float* clusterCenters, // [numDataset, dimDataset] - const float* pqCenters, // [dimPq, 256, lenPq] - const uint8_t* pqDataset, // [numDataset, dimPq] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* indexPtr, // [numClusters + 1] - const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] - const float* query, // [dimDataset] - uint64_t* topKNeighbors, // [topK] - float* topKDistances, // [topK] - void* workspace); - -void ivfpq_encode(uint32_t numDataset, - uint32_t ldDataset, // (*) ldDataset >= numDataset - uint32_t dimPq, - 
uint32_t bitPq, // 4 <= bitPq <= 8 - const uint32_t* label, // [dimPq, ldDataset] - uint8_t* output // [numDataset, dimPq] +inline void ivfpq_search(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + uint32_t numQueries, + const float* clusterCenters, // [numDataset, dimDataset] + const float* pqCenters, // [dimPq, 256, lenPq] + const uint8_t* pqDataset, // [numDataset, dimPq] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* indexPtr, // [numClusters + 1] + const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] + const float* query, // [dimDataset] + uint64_t* topKNeighbors, // [topK] + float* topKDistances, // [topK] + void* workspace); + +inline void ivfpq_encode(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] ); // bool manage_local_topk(cuannIvfPqDescriptor_t desc); -size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads); +inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads); // __global__ void ivfpq_init_topkScores(float* topkScores, // [num,] @@ -3067,7 +3067,7 @@ __global__ void ivfpq_make_outputs(uint32_t numProbes, } // -bool manage_local_topk(cuannIvfPqDescriptor_t desc) +inline bool manage_local_topk(cuannIvfPqDescriptor_t desc) { int depth = (desc->topK + 31) / 32; if (depth > 4) { return false; } @@ -3077,7 +3077,7 @@ bool manage_local_topk(cuannIvfPqDescriptor_t desc) } // -size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads) +inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads) { if (manage_local_topk(desc)) { int topk_32 = (desc->topK + 31) / 32; @@ -3087,7 +3087,7 @@ size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads) } // return workspace size -size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc) +inline size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc) { size_t size = 0; // clusterLabelsOut [maxBatchSize, numProbes] @@ -3251,12 +3251,12 @@ __global__ void ivfpq_encode_kernel(uint32_t numDataset, } // -void ivfpq_encode(uint32_t numDataset, - uint32_t ldDataset, // (*) ldDataset >= numDataset - uint32_t dimPq, - uint32_t bitPq, // 4 <= bitPq <= 8 - const uint32_t* label, // [dimPq, ldDataset] - uint8_t* output // [numDataset, dimPq] +inline void ivfpq_encode(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] ) { #if 1 @@ -3318,15 +3318,15 @@ template __global__ void ivfpq_make_outputs( * */ -cuannStatus_t cuannCreate(cuannHandle_t* handle); -cuannStatus_t cuannDestroy(cuannHandle_t handle); -cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream); -cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId); +inline cuannStatus_t cuannCreate(cuannHandle_t* handle); +inline cuannStatus_t cuannDestroy(cuannHandle_t handle); +inline cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream); +inline cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId); -cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc); -cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc); +inline cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc); +inline 
cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc); -cuannStatus_t cuannIvfPqSetIndexParameters( +inline cuannStatus_t cuannIvfPqSetIndexParameters( cuannIvfPqDescriptor_t desc, const uint32_t numClusters, /* Number of clusters */ const uint32_t numDataset, /* Number of dataset entries */ @@ -3336,19 +3336,19 @@ cuannStatus_t cuannIvfPqSetIndexParameters( const cuannSimilarity_t similarity, const cuannPqCenter_t typePqCenter); -cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - cuannSimilarity_t* similarity, - cuannPqCenter_t* typePqCenter); +inline cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + uint32_t* bitPq, + cuannSimilarity_t* similarity, + cuannPqCenter_t* typePqCenter); -cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, - size_t* size /* bytes of dataset index */); +inline cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, + size_t* size /* bytes of dataset index */); -cuannStatus_t cuannIvfPqBuildIndex( +inline cuannStatus_t cuannIvfPqBuildIndex( cuannHandle_t handle, cuannIvfPqDescriptor_t desc, const void* dataset, /* [numDataset, dimDataset] */ @@ -3360,74 +3360,75 @@ cuannStatus_t cuannIvfPqBuildIndex( bool hierarchicalClustering, /* If true, do kmeans training hierarchically */ void* index /* database index to build */); -cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const char* fileName); +inline cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const char* fileName); -cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - void** index, - const char* fileName); +inline cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + void** index, + const char* fileName); -cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( +inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( cuannHandle_t handle, const char* oldIndexFileName, const char* newIndexFileName, const void* newVectors, /* [numVectorsToAdd, dimDataset] */ uint32_t numNewVectors); -cuannStatus_t cuannIvfPqSetSearchParameters( +inline cuannStatus_t cuannIvfPqSetSearchParameters( cuannIvfPqDescriptor_t desc, const uint32_t numProbes, /* Number of clusters to probe */ const uint32_t topK); /* Number of search results */ -cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t internalDistanceDtype, - cudaDataType_t smemLutDtype, - const uint32_t preferredThreadBlockSize); - -cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numProbes, - uint32_t* topK); - -cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t* internalDistanceDtype, - cudaDataType_t* smemLutDtype, - uint32_t* preferredThreadBlockSize); - -cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - const void* index, - uint32_t numQueries, - size_t maxWorkspaceSize, - size_t* workspaceSize); - -cuannStatus_t cuannIvfPqSearch(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const void* queries, /* [numQueries, dimDataset] */ - cudaDataType_t dtype, - uint32_t 
numQueries, - uint64_t* neighbors, /* [numQueries, topK] */ - float* distances, /* [numQueries, topK] */ - void* workspace); - -cuannStatus_t cuannPostprocessingRefine(uint32_t numDataset, - uint32_t numQueries, - uint32_t dimDataset, - const void* dataset, /* [numDataset, dimDataset] */ - const void* queries, /* [numQueries, dimDataset] */ - cudaDataType_t dtype, - cuannSimilarity_t similarity, - uint32_t topK, - const uint64_t* neighbors, /* [numQueries, topK] */ - uint32_t refinedTopK, - uint64_t* refinedNeighbors, /* [numQueries, refinedTopK] */ - float* refinedDistances /* [numQueries, refinedTopK] */ +inline cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t internalDistanceDtype, + cudaDataType_t smemLutDtype, + const uint32_t preferredThreadBlockSize); + +inline cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numProbes, + uint32_t* topK); + +inline cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t* internalDistanceDtype, + cudaDataType_t* smemLutDtype, + uint32_t* preferredThreadBlockSize); + +inline cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + uint32_t numQueries, + size_t maxWorkspaceSize, + size_t* workspaceSize); + +inline cuannStatus_t cuannIvfPqSearch(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const void* queries, /* [numQueries, dimDataset] */ + cudaDataType_t dtype, + uint32_t numQueries, + uint64_t* neighbors, /* [numQueries, topK] */ + float* distances, /* [numQueries, topK] */ + void* workspace); + +inline cuannStatus_t cuannPostprocessingRefine( + uint32_t numDataset, + uint32_t numQueries, + uint32_t dimDataset, + const void* dataset, /* [numDataset, dimDataset] */ + const void* queries, /* [numQueries, dimDataset] */ + cudaDataType_t dtype, + cuannSimilarity_t similarity, + uint32_t topK, + const uint64_t* neighbors, /* [numQueries, topK] */ + uint32_t refinedTopK, + uint64_t* refinedNeighbors, /* [numQueries, refinedTopK] */ + float* refinedDistances /* [numQueries, refinedTopK] */ ); -cuannStatus_t cuannPostprocessingMerge( +inline cuannStatus_t cuannPostprocessingMerge( uint32_t numSplit, uint32_t numQueries, uint32_t topK, @@ -3438,13 +3439,13 @@ cuannStatus_t cuannPostprocessingMerge( float* distances /* [numQueries, topK] */ ); -size_t _cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t desc) { // [numClusters, dimDatasetExt] return _cuann_aligned(sizeof(float) * desc->numClusters * desc->dimDatasetExt); } -size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) { size_t size_base = sizeof(float) * (1 << desc->bitPq) * desc->lenPq; if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { @@ -3456,47 +3457,47 @@ size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) } } -size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t desc) { // [numDataset, dimPq * bitPq / 8] return _cuann_aligned(sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8); } -size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t desc) { // [numDataset,] return _cuann_aligned(sizeof(uint32_t) * 
desc->numDataset); } -size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t desc) { // [numClusters + 1,] return _cuann_aligned(sizeof(uint32_t) * (desc->numClusters + 1)); } -size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t desc) { // [dimDataset, dimRotDataset] return _cuann_aligned(sizeof(float) * desc->dimDataset * desc->dimRotDataset); } -size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t desc) { // [numClusters, dimRotDataset] return _cuann_aligned(sizeof(float) * desc->numClusters * desc->dimRotDataset); } -void _cuann_get_index_pointers(cuannIvfPqDescriptor_t desc, - const void* index, - struct cuannIvfPqIndexHeader** header, - float** clusterCenters, // [numClusters, dimDatasetExt] - float** pqCenters, // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t** pqDataset, // [numDataset, dimPq * bitPq / 8] - uint32_t** originalNumbers, // [numDataset] - uint32_t** indexPtr, // [numClusters + 1] - float** rotationMatrix, // [dimDataset, dimRotDataset] - float** clusterRotCenters // [numClusters, dimRotDataset] +inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t desc, + const void* index, + struct cuannIvfPqIndexHeader** header, + float** clusterCenters, // [numClusters, dimDatasetExt] + float** pqCenters, // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t** pqDataset, // [numDataset, dimPq * bitPq / 8] + uint32_t** originalNumbers, // [numDataset] + uint32_t** indexPtr, // [numClusters + 1] + float** rotationMatrix, // [dimDataset, dimRotDataset] + float** clusterRotCenters // [numClusters, dimRotDataset] ) { *header = (struct cuannIvfPqIndexHeader*)index; @@ -3531,7 +3532,7 @@ int descending(const void* a, const void* b) } // (*) This is temporal. Need to be removed in future. 
-void _cuann_get_random_norm_vector(int len, float* vector) +inline void _cuann_get_random_norm_vector(int len, float* vector) { float sqsum = 0.0; for (int i = 0; i < len; i++) { @@ -3544,7 +3545,7 @@ void _cuann_get_random_norm_vector(int len, float* vector) } } -void _cuann_get_inclusiveSumSortedClusterSize( +inline void _cuann_get_inclusiveSumSortedClusterSize( cuannIvfPqDescriptor_t desc, const uint32_t* indexPtr, // [numClusters + 1] float* clusterCenters, // [numClusters, dimDatasetExt] @@ -3588,9 +3589,9 @@ void _cuann_get_inclusiveSumSortedClusterSize( assert((*output)[desc->numClusters - 1] == desc->numDataset); } -void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, - const float* clusterCenters, // [numClusters, dimDataset,] - float** output // [numClusters,] +inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, + const float* clusterCenters, // [numClusters, dimDataset,] + float** output // [numClusters,] ) { cudaError_t cudaError; @@ -3634,11 +3635,11 @@ T _cuann_rand() } // make rotation matrix -void _cuann_make_rotation_matrix(uint32_t nRows, - uint32_t nCols, - uint32_t lenPq, - bool randomRotation, - float* rotationMatrix // [nRows, nCols] +inline void _cuann_make_rotation_matrix(uint32_t nRows, + uint32_t nCols, + uint32_t lenPq, + bool randomRotation, + float* rotationMatrix // [nRows, nCols] ) { assert(nRows >= nCols); @@ -3694,11 +3695,11 @@ void _cuann_make_rotation_matrix(uint32_t nRows, } // show centers (for debuging) -void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const uint32_t* centerSize, - const uint32_t numShow = 5) +inline void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const uint32_t* centerSize, + const uint32_t numShow = 5) { for (uint64_t k = 0; k < numCenters; k++) { if ((numShow <= k) && (k < numCenters - numShow)) { @@ -3718,10 +3719,10 @@ void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenter } // show dataset (for debugging) -void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] - uint32_t numDataset, - uint32_t dimDataset, - const uint32_t numShow = 5) +inline void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] + uint32_t numDataset, + uint32_t dimDataset, + const uint32_t numShow = 5) { for (uint64_t i = 0; i < numDataset; i++) { if ((numShow <= i) && (i < numDataset - numShow)) { @@ -3741,10 +3742,10 @@ void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] } // show pq code (for debuging) -void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq] - uint32_t numDataset, - uint32_t dimPq, - const uint32_t numShow = 5) +inline void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq] + uint32_t numDataset, + uint32_t dimPq, + const uint32_t numShow = 5) { for (uint64_t i = 0; i < numDataset; i++) { if ((numShow <= i) && (i < numDataset - numShow)) { @@ -3787,26 +3788,26 @@ uint32_t _get_num_trainset(uint32_t clusterSize, uint32_t dimPq, uint32_t bitPq) } // -void _cuann_compute_PQ_code(cuannHandle_t handle, - uint32_t numDataset, - uint32_t dimDataset, - uint32_t dimRotDataset, - uint32_t dimPq, - uint32_t lenPq, - uint32_t bitPq, - uint32_t numClusters, - cudaDataType_t dtype, - cuannPqCenter_t typePqCenter, - uint32_t maxClusterSize, - float* clusterCenters, // [numClusters, dimDataset] - const float* rotationMatrix, // [dimRotDataset, 
dimDataset] - const void* dataset, // [numDataset] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterSize, // [numClusters] - const uint32_t* indexPtr, // [numClusters + 1] - float* pqCenters, // [...] - uint32_t numIterations, - uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] +inline void _cuann_compute_PQ_code(cuannHandle_t handle, + uint32_t numDataset, + uint32_t dimDataset, + uint32_t dimRotDataset, + uint32_t dimPq, + uint32_t lenPq, + uint32_t bitPq, + uint32_t numClusters, + cudaDataType_t dtype, + cuannPqCenter_t typePqCenter, + uint32_t maxClusterSize, + float* clusterCenters, // [numClusters, dimDataset] + const float* rotationMatrix, // [dimRotDataset, dimDataset] + const void* dataset, // [numDataset] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterSize, // [numClusters] + const uint32_t* indexPtr, // [numClusters + 1] + float* pqCenters, // [...] + uint32_t numIterations, + uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] ) { // @@ -4065,7 +4066,7 @@ void _cuann_compute_PQ_code(cuannHandle_t handle, } // cuannCreate -cuannStatus_t cuannCreate(cuannHandle_t* handle) +inline cuannStatus_t cuannCreate(cuannHandle_t* handle) { cudaError_t cudaError; cublasStatus_t cublasError; @@ -4122,7 +4123,7 @@ cuannStatus_t cuannCreate(cuannHandle_t* handle) } // cuannDestroy -cuannStatus_t cuannDestroy(cuannHandle_t handle) +inline cuannStatus_t cuannDestroy(cuannHandle_t handle) { if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } cublasStatus_t cublasError; @@ -4141,7 +4142,7 @@ cuannStatus_t cuannDestroy(cuannHandle_t handle) } // cuannSetStream -cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream) +inline cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream) { if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } int devId = handle->devId; @@ -4152,7 +4153,7 @@ cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream) } // cuannSetDevice -cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId) +inline cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId) { if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } if (devId < 0 || devId >= handle->numDevices) { @@ -4176,7 +4177,7 @@ cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId) } // cuannIvfPqCreateDescriptor -cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) +inline cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) { *desc = (cuannIvfPqDescriptor_t)malloc(sizeof(struct cuannIvfPqDescriptor)); if (*desc == NULL) { return CUANN_STATUS_ALLOC_FAILED; } @@ -4198,7 +4199,7 @@ cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) } // cuannIvfPqDestroyDescriptor -cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) +inline cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } if (desc->sqsumClusters != NULL) { cudaFree(desc->sqsumClusters); } @@ -4207,14 +4208,14 @@ cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) } // cuannIvfPqSetIndexParameters -cuannStatus_t cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, - const uint32_t numClusters, - const uint32_t numDataset, - const uint32_t dimDataset, - const uint32_t dimPq, - const uint32_t bitPq, - const cuannSimilarity_t similarity, - const cuannPqCenter_t typePqCenter) +inline cuannStatus_t cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, + 
const uint32_t numClusters, + const uint32_t numDataset, + const uint32_t dimDataset, + const uint32_t dimPq, + const uint32_t bitPq, + const cuannSimilarity_t similarity, + const cuannPqCenter_t typePqCenter) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } if (numClusters == 0) { @@ -4299,14 +4300,14 @@ cuannStatus_t cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetIndexParameters -cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - cuannSimilarity_t* similarity, - cuannPqCenter_t* typePqCenter) +inline cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + uint32_t* bitPq, + cuannSimilarity_t* similarity, + cuannPqCenter_t* typePqCenter) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } @@ -4321,7 +4322,7 @@ cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetIndexSize -cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) +inline cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } @@ -4341,16 +4342,16 @@ cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) } // cuannIvfPqBuildIndex -cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - const void* dataset, - const void* trainset, - cudaDataType_t dtype, - uint32_t numTrainset, - uint32_t numIterations, - bool randomRotation, - bool hierarchicalClustering, - void* index) +inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* dataset, + const void* trainset, + cudaDataType_t dtype, + uint32_t numTrainset, + uint32_t numIterations, + bool randomRotation, + bool hierarchicalClustering, + void* index) { if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } int cuannDevId = handle->devId; @@ -5111,10 +5112,10 @@ cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, } // cuannIvfPqSaveIndex -cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const char* fileName) +inline cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const char* fileName) { if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } int orgDevId = _cuann_set_device(handle->devId); @@ -5137,10 +5138,10 @@ cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, } // cuannIvfPqLoadIndex -cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - void** index, - const char* fileName) +inline cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + void** index, + const char* fileName) { if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } int orgDevId = _cuann_set_device(handle->devId); @@ -5245,7 +5246,7 @@ cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, } // cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex -cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( +inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( cuannHandle_t handle, const char* oldIndexFileName, const char* newIndexFileName, @@ -5587,9 +5588,9 @@ cuannStatus_t 
cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( } // cuannIvfPqSetSearchParameters -cuannStatus_t cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, - const uint32_t numProbes, - const uint32_t topK) +inline cuannStatus_t cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, + const uint32_t numProbes, + const uint32_t topK) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } if (numProbes == 0) { @@ -5653,10 +5654,10 @@ cuannStatus_t cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqSetSearchParameters -cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t internalDistanceDtype, - cudaDataType_t smemLutDtype, - const uint32_t preferredThreadBlockSize) +inline cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t internalDistanceDtype, + cudaDataType_t smemLutDtype, + const uint32_t preferredThreadBlockSize) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } if (internalDistanceDtype != CUDA_R_16F && internalDistanceDtype != CUDA_R_32F) { @@ -5690,9 +5691,9 @@ cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetSearchParameters -cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numProbes, - uint32_t* topK) +inline cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numProbes, + uint32_t* topK) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } *numProbes = desc->numProbes; @@ -5701,10 +5702,10 @@ cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetSearchTuningParameters -cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t* internalDistanceDtype, - cudaDataType_t* smemLutDtype, - uint32_t* preferredThreadBlockSize) +inline cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t* internalDistanceDtype, + cudaDataType_t* smemLutDtype, + uint32_t* preferredThreadBlockSize) { if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } *internalDistanceDtype = desc->internalDistanceDtype; @@ -5714,12 +5715,12 @@ cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqSearch -cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - const void* index, - uint32_t maxQueries, - size_t maxWorkspaceSize, - size_t* workspaceSize) +inline cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + const void* index, + uint32_t maxQueries, + size_t maxWorkspaceSize, + size_t* workspaceSize) { if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } @@ -5803,7 +5804,7 @@ cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, } // cuannIvfPqSearch -cuannStatus_t cuannIvfPqSearch( +inline cuannStatus_t cuannIvfPqSearch( cuannHandle_t handle, cuannIvfPqDescriptor_t desc, const void* index, @@ -6920,19 +6921,19 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( // search template -void ivfpq_search(cuannHandle_t handle, - cuannIvfPqDescriptor_t desc, - uint32_t numQueries, - const float* clusterCenters, // [numDataset, dimRotDataset] - const float* pqCenters, // [dimPq, 1 << desc->bitPq, lenPq] - const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* indexPtr, // [numClusters + 1] 
- const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] - const float* query, // [numQueries, dimRotDataset] - uint64_t* topkNeighbors, // [numQueries, topK] - float* topkDistances, // [numQueries, topK] - void* workspace) +inline void ivfpq_search(cuannHandle_t handle, + cuannIvfPqDescriptor_t desc, + uint32_t numQueries, + const float* clusterCenters, // [numDataset, dimRotDataset] + const float* pqCenters, // [dimPq, 1 << desc->bitPq, lenPq] + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* indexPtr, // [numClusters + 1] + const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] + const float* query, // [numQueries, dimRotDataset] + uint64_t* topkNeighbors, // [numQueries, topK] + float* topkDistances, // [numQueries, topK] + void* workspace) { assert(numQueries <= desc->maxBatchSize); diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 82d381bbb5..fd20a07ec2 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -111,6 +111,7 @@ add_executable(test_raft test/sparse/sort.cu test/sparse/symmetrize.cu test/spatial/ann_ivf_flat.cu + test/spatial/ann_ivf_pq.cu test/spatial/knn.cu test/spatial/fused_l2_knn.cu test/spatial/haversine.cu diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu new file mode 100644 index 0000000000..f6351b8025 --- /dev/null +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "./ann_base_kernel.cuh" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft { +namespace spatial { +namespace knn { +struct IvfPqInputs { + int num_queries; + int num_db_vecs; + int dim; + int k; + int nprobe; + int nlist; + raft::distance::DistanceType metric; +}; + +template +struct idx_dist_pair { + IdxT idx; + DistT dist; + compareDist eq_compare; + bool operator==(const idx_dist_pair& a) const + { + if (idx == a.idx) return true; + if (eq_compare(dist, a.dist)) return true; + return false; + } + idx_dist_pair(IdxT x, DistT y, compareDist op) : idx(x), dist(y), eq_compare(op) {} +}; + +template +auto eval_knn(const std::vector& expected_idx, + const std::vector& actual_idx, + const std::vector& expected_dist, + const std::vector& actual_dist, + size_t rows, + size_t cols, + const DistT eps, + double min_recall) -> testing::AssertionResult +{ + size_t match_count = 0; + size_t total_count = static_cast(rows) * static_cast(cols); + for (size_t i = 0; i < rows; ++i) { + for (size_t k = 0; k < cols; ++k) { + size_t idx_k = i * cols + k; // row major assumption! + auto act_idx = actual_idx[idx_k]; + auto act_dist = actual_dist[idx_k]; + for (size_t j = 0; j < cols; ++j) { + size_t idx = i * cols + j; // row major assumption! 
+ auto exp_idx = expected_idx[idx]; + auto exp_dist = expected_dist[idx]; + idx_dist_pair exp_kvp(exp_idx, exp_dist, raft::CompareApprox(eps)); + idx_dist_pair act_kvp(act_idx, act_dist, raft::CompareApprox(eps)); + if (exp_kvp == act_kvp) { + match_count++; + break; + } + } + } + } + RAFT_LOG_INFO("Recall = %zu/%zu", match_count, total_count); + double actual_recall = static_cast(match_count) / static_cast(total_count); + if (actual_recall < min_recall - eps) { + if (actual_recall < min_recall * min_recall - eps) { + RAFT_LOG_ERROR("Recall is much lower than the minimum (%f < %f)", actual_recall, min_recall); + } else { + RAFT_LOG_WARN("Recall is suspiciously too low (%f < %f)", actual_recall, min_recall); + } + if (match_count == 0 || actual_recall < min_recall * std::min(min_recall, 0.5) - eps) { + return testing::AssertionFailure() + << "actual recall (" << actual_recall + << ") is much smaller than the minimum expected recall (" << min_recall << ")."; + } + } + return testing::AssertionSuccess(); +} + +#define CUANN_CHECK(ret) RAFT_EXPECTS(ret == ivf_pq::CUANN_STATUS_SUCCESS, "cuann failure: %d", ret) + +template +class IvfPqTest : public ::testing::TestWithParam { + public: + IvfPqTest() + : stream_(handle_.get_stream()), + ps(::testing::TestWithParam::GetParam()), + database(0, stream_, &managed_memory), + search_queries(0, stream_) + { + } + + protected: + void testIvfPq() + { + size_t queries_size = ps.num_queries * ps.k; + std::vector indices_ivf_pq(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_ivf_pq(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + using acc_t = typename detail::utils::config::value_t; + naiveBfKnn(distances_naive_dev.data(), + indices_naive_dev.data(), + search_queries.data(), + database.data(), + ps.num_queries, + ps.num_db_vecs, + ps.dim, + ps.k, + ps.metric, + 2.0f, + stream_); + update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + handle_.sync_stream(stream_); + } + + { + // unless something is really wrong with clustering, this could serve as a lower bound on + // recall + double min_recall = static_cast(ps.nprobe) / static_cast(ps.nlist); + + rmm::device_uvector distances_ivf_pq_dev(queries_size, stream_); + rmm::device_uvector indices_ivf_pq_dev(queries_size, stream_); + + { + std::unique_ptr> + cuann_handle{[]() { + ivf_pq::cuannHandle_t h; + CUANN_CHECK(ivf_pq::cuannCreate(&h)); + return h; + }(), + [](ivf_pq::cuannHandle_t h) { ivf_pq::cuannDestroy(h); }}; + std::unique_ptr> + cuann_desc{ + []() { + ivf_pq::cuannIvfPqDescriptor_t d; + CUANN_CHECK(ivf_pq::cuannIvfPqCreateDescriptor(&d)); + return d; + }(), + [](ivf_pq::cuannIvfPqDescriptor_t d) { ivf_pq::cuannIvfPqDestroyDescriptor(d); }}; + + CUANN_CHECK(ivf_pq::cuannSetDevice(cuann_handle.get(), handle_.get_device())); + CUANN_CHECK(ivf_pq::cuannSetStream(cuann_handle.get(), handle_.get_stream())); + + // Number of kmeans clusters. + // + // The number of vectors per cluster, or 'numDataset' / 'numClusters', + // should be approximately 1,000 to 10,000. + uint32_t n_clusters = ps.nlist; + // Important parameters of the index to create. + // + // 'bitPq' is the bit length of the vector element after compression by PQ. + // 'dimPq' is the dimensionality of the vector after compression by PQ. 
+ // + // 'bitPq' is 4, 5, 6, 7, or 8. The smaller the 'bitPq', the smaller the + // index size and the better the search performance, but the lower the recall. + // + // Similarly, a smaller 'dimPq' results in a smaller index size and better + // search performance, but lower recall. If 'bitPq' is 8, 'dimPq' can be set + // to any number, but multiple of 8 are desirable for good performance. + // If 'bitPq' is not 8, 'dimPq' must be basically multiple of 8. For good + // performance, multiple 32 is desirable. + // + uint32_t bitPq = 8; + uint32_t dimPq = ps.dim; + if (dimPq >= 128) { + dimPq = raft::alignDown(dimPq / 2, 32); + } else if (dimPq >= 32) { + dimPq = raft::alignDown(dimPq, 32); + } else if (dimPq >= 8) { + dimPq = raft::alignDown(dimPq, 8); + } + // If true, dataset and query vectors are rotated by random rotation matrix + // created at indexing time. + // + bool randomRotation = ps.dim < 1024; // disable for large-dimensional data (CPU intensive) + // Number of iterations for kmeans training. + uint32_t numIterations = 25; + // metric + ivf_pq::cuannSimilarity_t similarity = + ps.metric == raft::distance::DistanceType::InnerProduct ? ivf_pq::CUANN_SIMILARITY_INNER + : ivf_pq::CUANN_SIMILARITY_L2; + // Specify whether PQ codebooks are created per subspace or per cluster. + ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_SUBSPACE; + CUANN_CHECK(ivf_pq::cuannIvfPqSetIndexParameters( + cuann_desc.get(), + n_clusters, /* Number of clusters */ + uint32_t(ps.num_db_vecs), /* Number of dataset entries */ + uint32_t(ps.dim), /* Dimension of each entry */ + dimPq, /* Dimension of each entry after product quantization */ + bitPq, /* Bit length of PQ */ + similarity, + typePqCenter)); + + // Allocate memory for index + size_t ivf_pq_index_size; + CUANN_CHECK(ivf_pq::cuannIvfPqGetIndexSize(cuann_desc.get(), &ivf_pq_index_size)); + rmm::device_buffer ivf_pq_index_buf_managed(ivf_pq_index_size, stream_, &managed_memory); + + // Build index + cudaDataType_t dtype; + if constexpr (std::is_same_v) { + dtype = CUDA_R_8U; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8I; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_32F; + } + CUANN_CHECK(ivf_pq::cuannIvfPqBuildIndex( + cuann_handle.get(), + cuann_desc.get(), + database.data(), // dataset + database.data(), // ?kmeans? trainset + dtype, + uint32_t(ps.num_db_vecs), // size of the trainset (I guess for kmeans) + numIterations, + randomRotation, + true, // hierarchialClustering: always true in raft + ivf_pq_index_buf_managed.data() // memory allocated for the index + )); + handle_.sync_stream(stream_); + + // set search parameters + CUANN_CHECK(ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc.get(), ps.nprobe, ps.k)); + // Data type of LUT to be created dynamically at search time. + // + // The use of low-precision types reduces the amount of shared memory + // required at search time, so fast shared memory kernels can be used even + // for datasets with large dimansionality. Note that the recall is slightly + // degraded when low-precision type is selected. + // + cudaDataType_t smemLutDtype = CUDA_R_32F; + // smemLutDtype = CUDA_R_16F; + // smemLutDtype = CUDA_R_8U; + // Storage data type for distance/similarity computed at search time. + // + // If the performance limiter at search time is device memory access, + // selecting FP16 will improve performance slightly. 
+ // + cudaDataType_t internalDistanceDtype = CUDA_R_32F; + // internalDistanceDtype = CUDA_R_16F; + + // Thread block size of the distance calculation kernel at search time. + // + // If 0, the thread block size is determined automatically. + // + uint32_t preferredThreadBlockSize = 0; // 0, 256, 512, or 1024 + CUANN_CHECK(ivf_pq::cuannIvfPqSetSearchTuningParameters( + cuann_desc.get(), internalDistanceDtype, smemLutDtype, preferredThreadBlockSize)); + // Maximum number of query vectors to search. + uint32_t maxQueries = 1000000; + // Maximum number of query vectors to search at the same time. + uint32_t batchSize = maxQueries; + // Maximum device memory size that may be used as workspace at search time. + // maxSearchWorkspaceSize = 0; // default + size_t maxSearchWorkspaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GiB + + // Allocate memory for index + size_t ivf_pq_search_workspace_size; + CUANN_CHECK(ivf_pq::cuannIvfPqSearch_bufferSize(cuann_handle.get(), + cuann_desc.get(), + ivf_pq_index_buf_managed.data(), + batchSize, + maxSearchWorkspaceSize, + &ivf_pq_search_workspace_size)); + rmm::device_buffer ivf_pq_search_ws_buf(ivf_pq_search_workspace_size, stream_); + + // finally, search! + CUANN_CHECK(cuannIvfPqSearch(cuann_handle.get(), + cuann_desc.get(), + ivf_pq_index_buf_managed.data(), + search_queries.data(), + dtype, + ps.num_queries, + indices_ivf_pq_dev.data(), + distances_ivf_pq_dev.data(), + ivf_pq_search_ws_buf.data())); + handle_.sync_stream(stream_); + + update_host(distances_ivf_pq.data(), distances_ivf_pq_dev.data(), queries_size, stream_); + update_host(indices_ivf_pq.data(), + reinterpret_cast(indices_ivf_pq_dev.data()), + queries_size, + stream_); + handle_.sync_stream(stream_); + } + handle_.sync_stream(stream_); + ASSERT_TRUE(eval_knn(indices_naive, + indices_ivf_pq, + distances_naive, + distances_ivf_pq, + ps.num_queries, + ps.k, + float(0.001), + min_recall)); + } + } + + void SetUp() override + { + database.resize(ps.num_db_vecs * ps.dim, stream_); + search_queries.resize(ps.num_queries * ps.dim, stream_); + + raft::random::Rng r(1234ULL); + if constexpr (std::is_same{}) { + r.uniform(database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0), stream_); + r.uniform(search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + } else { + r.uniformInt(database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20), stream_); + r.uniformInt(search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20), stream_); + } + handle_.sync_stream(stream_); + } + + void TearDown() override + { + handle_.sync_stream(stream_); + database.resize(0, stream_); + search_queries.resize(0, stream_); + } + + private: + raft::handle_t handle_; + rmm::cuda_stream_view stream_; + rmm::mr::managed_memory_resource managed_memory; + IvfPqInputs ps; + rmm::device_uvector database; + rmm::device_uvector search_queries; +}; + +const std::vector inputs = { + // test various dims (aligned and not aligned to vector sizes) + {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, + + // test dims that do not fit into kernel shared memory limits + {1000, 10000, 2048, 16, 
40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + + // various random combinations + {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded}, + {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded}, + {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded}, + {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded}, + + {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct}, + {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct}, + {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct}, + {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct}, + + {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct}, + + // test splitting the big query batches (> max gridDim.y) into smaller batches + {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct}, + {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct}, + + // test radix_sort for getting the cluster selection + {1000, + 10000, + 16, + 10, + raft::spatial::knn::detail::topk::kMaxCapacity * 2, + raft::spatial::knn::detail::topk::kMaxCapacity * 4, + raft::distance::DistanceType::L2Expanded}, + {1000, + 10000, + 16, + 10, + raft::spatial::knn::detail::topk::kMaxCapacity * 4, + raft::spatial::knn::detail::topk::kMaxCapacity * 4, + raft::distance::DistanceType::InnerProduct}}; + +typedef IvfPqTest IvfPqTestF; +TEST_P(IvfPqTestF, IvfPq) { this->testIvfPq(); } + +INSTANTIATE_TEST_CASE_P(IvfPqTest, IvfPqTestF, ::testing::ValuesIn(inputs)); + +typedef IvfPqTest IvfPqTestF_uint8; +TEST_P(IvfPqTestF_uint8, IvfPq) { this->testIvfPq(); } + +INSTANTIATE_TEST_CASE_P(IvfPqTest, IvfPqTestF_uint8, ::testing::ValuesIn(inputs)); + +typedef IvfPqTest IvfPqTestF_int8; +TEST_P(IvfPqTestF_int8, IvfPq) { this->testIvfPq(); } + +INSTANTIATE_TEST_CASE_P(IvfPqTest, IvfPqTestF_int8, ::testing::ValuesIn(inputs)); + +} // namespace knn +} // namespace spatial +} // namespace raft From 1d6402f7f094b366e3f7841e151e7e6f65646c76 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 Aug 2022 14:46:46 +0200 Subject: [PATCH 004/140] WIP replacing chunks of code with raft's helpers --- .../knn/detail/ann_kmeans_balanced.cuh | 30 +- cpp/include/raft/spatial/knn/ivf_pq.cuh | 716 +++++++----------- cpp/test/spatial/ann_ivf_pq.cu | 20 +- 3 files changed, 278 insertions(+), 488 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 
74e1ae75a8..7fadd6367f 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -53,16 +53,16 @@ namespace raft::spatial::knn::detail::kmeans { * @param stream * @param mr (optional) memory resource to use for temporary allocations */ -void predict_float_core(const handle_t& handle, - const float* centers, - uint32_t n_clusters, - uint32_t dim, - const float* dataset, - size_t n_rows, - uint32_t* labels, - raft::distance::DistanceType metric, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +inline void predict_float_core(const handle_t& handle, + const float* centers, + uint32_t n_clusters, + uint32_t dim, + const float* dataset, + size_t n_rows, + uint32_t* labels, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector distances(n_rows * n_clusters, stream, mr); @@ -115,7 +115,7 @@ void predict_float_core(const handle_t& handle, * @param n_rows dataset size * @return a suggested minibatch size */ -constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32_t +constexpr inline auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32_t { n_clusters = std::max(1, n_clusters); uint32_t minibatch_size = (1 << 20); @@ -411,10 +411,10 @@ void build_clusters(const handle_t& handle, } /** Calculate how many fine clusters should belong to each mesocluster. */ -auto arrange_fine_clusters(uint32_t n_clusters, - uint32_t n_mesoclusters, - size_t n_rows, - const uint32_t* mesocluster_sizes) +inline auto arrange_fine_clusters(uint32_t n_clusters, + uint32_t n_mesoclusters, + size_t n_rows, + const uint32_t* mesocluster_sizes) { std::vector fine_clusters_nums(n_mesoclusters); std::vector fine_clusters_csum(n_mesoclusters + 1); diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index b50d385c0f..ad849b7ba3 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -15,8 +15,10 @@ */ #pragma once +#include "detail/ann_kmeans_balanced.cuh" #include "detail/ann_utils.cuh" +#include #include #include @@ -144,20 +146,6 @@ typedef enum { CUANN_PQ_CENTER_PER_CLUSTER = 1, } cuannPqCenter_t; -/* Context */ -struct cuannContext { - int devId; - cudaStream_t stream; - cudaDeviceProp deviceProp; - cublasHandle_t cublasHandle; - - int numDevices; - cudaStream_t* streams; - cudaDeviceProp* deviceProps; - cublasHandle_t* cublasHandles; -}; -typedef struct cuannContext* cuannHandle_t; - /* IvfPq */ struct cuannIvfPqDescriptor { uint32_t numClusters; @@ -248,23 +236,6 @@ inline void _cuann_memset(void* ptr, int value, size_t count) } } -// outer add -__global__ void kern_outer_add(const float* a, - uint32_t numA, - const float* b, - uint32_t numB, - float* c // [numA, numB] -) -{ - uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint64_t iA = gid / numB; - uint64_t iB = gid % numB; - if (iA >= numA) return; - float valA = (a == NULL) ? 0.0 : a[iA]; - float valB = (b == NULL) ? 
0.0 : b[iB]; - c[gid] = valA + valB; -} - // argmin along column __global__ void kern_argmin(uint32_t nRows, uint32_t nCols, @@ -927,71 +898,6 @@ static cudaStream_t _cuann_set_cublas_stream(cublasHandle_t cublasHandle, cudaSt return cublasStream; } -// predict label of dataset -inline void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle, - const float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const float* dataset, // [numDataset, dimCenters] - uint32_t numDataset, - uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, - float* workspace) -{ - cublasStatus_t cublasError; - const uint32_t dimDataset = dimCenters; - float* sqsumCenters; // [numCenters] - float* sqsumDataset; // [numDataset] - float* distances; // [numDataset, numCenters] - - sqsumCenters = workspace; - sqsumDataset = sqsumCenters + numCenters; - distances = sqsumDataset + numDataset; - - float alpha; - float beta; - if (similarity == CUANN_SIMILARITY_INNER) { - alpha = -1.0; - beta = 0.0; - } else { - detail::utils::dots_along_rows( - numCenters, dimCenters, centers, sqsumCenters, rmm::cuda_stream_default); - detail::utils::dots_along_rows( - numDataset, dimDataset, dataset, sqsumDataset, rmm::cuda_stream_default); - - detail::utils::outer_add( - sqsumDataset, numDataset, sqsumCenters, numCenters, distances, rmm::cuda_stream_default); - alpha = -2.0; - beta = 1.0; - } - cudaStream_t cublasStream = _cuann_set_cublas_stream(cublasHandle, NULL); - cublasError = cublasGemmEx(cublasHandle, - CUBLAS_OP_T, - CUBLAS_OP_N, - numCenters, - numDataset, - dimCenters, - &alpha, - centers, - CUDA_R_32F, - dimCenters, - dataset, - CUDA_R_32F, - dimDataset, - &beta, - distances, - CUDA_R_32F, - numCenters, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); - exit(-1); - } - _cuann_set_cublas_stream(cublasHandle, cublasStream); - _cuann_argmin(numDataset, numCenters, distances, labels); -} - // uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset) { @@ -1026,7 +932,7 @@ inline size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, } // predict label of dataset -inline void _cuann_kmeans_predict(cublasHandle_t cublasHandle, +inline void _cuann_kmeans_predict(const handle_t& handle, float* centers, // [numCenters, dimCenters] uint32_t numCenters, uint32_t dimCenters, @@ -1075,11 +981,11 @@ inline void _cuann_kmeans_predict(cublasHandle_t cublasHandle, } float* curDataset; // [chunk, dimCenters] void* bufDataset; // [chunk, dimCenters] - float* workspace_core; + // float* workspace_core; curDataset = (float*)workspace; bufDataset = (void*)((uint8_t*)curDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); - workspace_core = - (float*)((uint8_t*)bufDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); + // workspace_core = + // (float*)((uint8_t*)bufDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); if (tempCenters != NULL && clusterSize != NULL) { _cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters); @@ -1095,28 +1001,40 @@ inline void _cuann_kmeans_predict(cublasHandle_t cublasHandle, kind = cudaMemcpyHostToDevice; } + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, numCenters * chunk); + if (pool_guard) { + RAFT_LOG_DEBUG("_cuann_kmeans_predict: using pool memory resource with initial size %zu 
bytes", + pool_guard->pool_size()); + } + auto stream = handle.get_stream(); + auto metric = similarity == CUANN_SIMILARITY_INNER ? raft::distance::DistanceType::InnerProduct + : raft::distance::DistanceType::L2Expanded; + for (uint64_t is = 0; is < numDataset; is += chunk) { uint64_t ie = min(is + chunk, (uint64_t)numDataset); uint32_t nDataset = ie - is; + // RAFT_LOG_INFO( + // "_cuann_kmeans_predict(dimCenters = %u, nDataset = %u, is = %zu)", dimCenters, nDataset, + // is); if (dtype == CUDA_R_32F) { - cudaError = cudaMemcpyAsync(bufDataset, - (float*)dataset + (is * dimCenters), - sizeof(float) * nDataset * dimCenters, - kind, - NULL); + // TODO: CRASH: Program hit cudaErrorIllegalAddress (error 700) due to "an illegal memory + // access was encountered" on CUDA API call to cudaMemcpyAsync_ptsz. + cudaError = cudaMemcpy(bufDataset, + (float*)dataset + (is * dimCenters), + sizeof(float) * nDataset * dimCenters, + kind); } else if (dtype == CUDA_R_8U) { - cudaError = cudaMemcpyAsync(bufDataset, - (uint8_t*)dataset + (is * dimCenters), - sizeof(uint8_t) * nDataset * dimCenters, - kind, - NULL); + cudaError = cudaMemcpy(bufDataset, + (uint8_t*)dataset + (is * dimCenters), + sizeof(uint8_t) * nDataset * dimCenters, + kind); } else if (dtype == CUDA_R_8I) { - cudaError = cudaMemcpyAsync(bufDataset, - (int8_t*)dataset + (is * dimCenters), - sizeof(int8_t) * nDataset * dimCenters, - kind, - NULL); + cudaError = cudaMemcpy(bufDataset, + (int8_t*)dataset + (is * dimCenters), + sizeof(int8_t) * nDataset * dimCenters, + kind); } if (cudaError != cudaSuccess) { fprintf(stderr, "(%s, %d) cudaMemcpy() failed.\n", __func__, __LINE__); @@ -1153,15 +1071,19 @@ inline void _cuann_kmeans_predict(cublasHandle_t cublasHandle, } // predict - _cuann_kmeans_predict_core(cublasHandle, - centers, - numCenters, - dimCenters, - curDataset, - nDataset, - labels + is, - similarity, - workspace_core); + stream.synchronize(); + detail::kmeans::predict_float_core(handle, + centers, + numCenters, + dimCenters, + curDataset, + nDataset, + labels + is, + metric, + stream, + device_memory); + stream.synchronize(); + if ((tempCenters != NULL) && (clusterSize != NULL)) { // accumulate @@ -1198,9 +1120,8 @@ inline void _cuann_kmeans_predict(cublasHandle_t cublasHandle, // // predict label of dataset with multiple devices // -inline void _cuann_kmeans_predict_MP(int numDevices, - cublasHandle_t* cublasHandles, // [numDevices] - float* clusterCenters, // [numCenters, dimCenters] +inline void _cuann_kmeans_predict_MP(const handle_t& handle, + float* clusterCenters, // [numCenters, dimCenters] uint32_t numCenters, uint32_t dimCenters, const void* dataset, // [numDataset, dimCenters] @@ -1213,6 +1134,7 @@ inline void _cuann_kmeans_predict_MP(int numDevices, bool updateCenter // If true, cluster Centers will be updated. 
) { + int numDevices = 1; // [numDevices][numCenters, dimCenters] float** clusterCentersCopy = _cuann_multi_device_malloc( numDevices, numCenters * dimCenters, "clusterCentersCopy", true /* use cudaMalloc() */); @@ -1252,7 +1174,7 @@ inline void _cuann_kmeans_predict_MP(int numDevices, } else if (dtype == CUDA_R_8I) { ptrDataset = (void*)((int8_t*)dataset + (uint64_t)dimCenters * d0); } - _cuann_kmeans_predict(cublasHandles[devId], + _cuann_kmeans_predict(handle, clusterCentersCopy[devId], numCenters, dimCenters, @@ -2544,7 +2466,7 @@ __global__ void _sort_topk_prep(uint32_t sizeBatch, } // -inline size_t _cuann_find_topk_bufferSize(cuannHandle_t handle, +inline size_t _cuann_find_topk_bufferSize(const handle_t& handle, uint32_t topK, uint32_t sizeBatch, uint32_t maxSamples, @@ -2564,8 +2486,8 @@ inline size_t _cuann_find_topk_bufferSize(cuannHandle_t handle, // state if (stateBitLen == 8) { // (*) Each thread has at least one array element for state - uint32_t numBlocks_perBatch = - ((handle->deviceProp).multiProcessorCount * 2 + sizeBatch) / sizeBatch; + uint32_t numBlocks_perBatch = (getMultiProcessorCount() * 2 + sizeBatch) / sizeBatch; + uint32_t numThreads_perBatch = numThreads * numBlocks_perBatch; uint32_t numSample_perThread = (maxSamples + numThreads_perBatch - 1) / numThreads_perBatch; uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; @@ -2609,7 +2531,7 @@ int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH) } // -inline void _cuann_find_topk(cuannHandle_t handle, +inline void _cuann_find_topk(const handle_t& handle, uint32_t topK, uint32_t sizeBatch, uint32_t maxSamples, @@ -2623,7 +2545,7 @@ inline void _cuann_find_topk(cuannHandle_t handle, constexpr int stateBitLen = STATE_BIT_LENGTH; assert(stateBitLen == 0 || stateBitLen == 8); #ifdef CUANN_DEBUG - cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle->stream); + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream()); #endif // Limit the maximum value of vecLen to 4. In the case of FP32, @@ -2643,9 +2565,9 @@ inline void _cuann_find_topk(cuannHandle_t handle, cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocksPerSm_topk, cg_kernel, numThreads, dynamicSMemSize); int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); - int numBlocks = min(numBlocks_perBatch * sizeBatch, - (handle->deviceProp).multiProcessorCount * numBlocksPerSm_topk); - numBlocks_perBatch = max(numBlocks / sizeBatch, 1); + int numBlocks = + min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); + numBlocks_perBatch = max(numBlocks / sizeBatch, 1); if (maxSamples <= numThreads * 10) { // When number of sample is small, using multiple thread-blocks does not // improve performance, in which case cta_kernel is used. 
Tentatively, @@ -2674,7 +2596,7 @@ inline void _cuann_find_topk(cuannHandle_t handle, } else if (vecLen == 1) { cta_kernel = kern_topk_cta_11; } - cta_kernel<<stream>>>( + cta_kernel<<>>( topK, sizeBatch, maxSamples, numSamples, (const uint32_t*)samples, state, labels); } else { void* args[9]; @@ -2687,7 +2609,7 @@ inline void _cuann_find_topk(cuannHandle_t handle, args[6] = {&(labels)}; args[7] = {&(count)}; args[8] = {nullptr}; - cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle->stream); + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream()); } if (!sort) { return; } @@ -2703,7 +2625,7 @@ inline void _cuann_find_topk(cuannHandle_t handle, dim3 stpThreads(128, 1, 1); dim3 stpBlocks((max(sizeBatch + 1, sizeBatch * topK) + stpThreads.x - 1) / stpThreads.x, 1, 1); - _sort_topk_prep<<stream>>>( + _sort_topk_prep<<>>( sizeBatch, topK, maxSamples, labels, samples, offsets, keys_in); size_t cub_ws_size = 0; @@ -2730,17 +2652,17 @@ inline void _cuann_find_topk(cuannHandle_t handle, offsets + 1, (int)0, (int)(sizeof(float) * 8), - handle->stream); + handle.get_stream()); cudaMemcpyAsync(labels, values_out, sizeof(uint32_t) * sizeBatch * topK, cudaMemcpyDeviceToDevice, - handle->stream); + handle.get_stream()); } // -inline void _cuann_find_topk(cuannHandle_t handle, +inline void _cuann_find_topk(const handle_t& handle, uint32_t topK, uint32_t sizeBatch, uint32_t maxSamples, @@ -2754,7 +2676,7 @@ inline void _cuann_find_topk(cuannHandle_t handle, constexpr int stateBitLen = STATE_BIT_LENGTH; assert(stateBitLen == 0 || stateBitLen == 8); #ifdef CUANN_DEBUG - cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle->stream); + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream()); #endif int vecLen = _get_vecLen(maxSamples); @@ -2772,9 +2694,9 @@ inline void _cuann_find_topk(cuannHandle_t handle, int numBlocksPerSm_topk; cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm_topk, cg_kernel, numThreads, 0); int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); - int numBlocks = min(numBlocks_perBatch * sizeBatch, - (handle->deviceProp).multiProcessorCount * numBlocksPerSm_topk); - numBlocks_perBatch = max(numBlocks / sizeBatch, 1); + int numBlocks = + min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); + numBlocks_perBatch = max(numBlocks / sizeBatch, 1); if (maxSamples <= numThreads * 10) { // When number of sample is small, using multiple thread-blocks does not // improve performance, in which case cta_kernel is used. 
Tentatively, @@ -2803,7 +2725,7 @@ inline void _cuann_find_topk(cuannHandle_t handle, } else if (vecLen == 1) { cta_kernel = kern_topk_cta_8; } - cta_kernel<<stream>>>( + cta_kernel<<>>( topK, sizeBatch, maxSamples, numSamples, (const uint16_t*)samples, state, labels); } else { void* args[9]; @@ -2816,7 +2738,7 @@ inline void _cuann_find_topk(cuannHandle_t handle, args[6] = {&(labels)}; args[7] = {&(count)}; args[8] = {nullptr}; - cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle->stream); + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream()); } } @@ -2837,11 +2759,11 @@ inline void _cuann_find_topk(cuannHandle_t handle, */ // -inline size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc); +inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t desc); // search template -inline void ivfpq_search(cuannHandle_t handle, +inline void ivfpq_search(const handle_t& handle, cuannIvfPqDescriptor_t desc, uint32_t numQueries, const float* clusterCenters, // [numDataset, dimDataset] @@ -3087,7 +3009,7 @@ inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThrea } // return workspace size -inline size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor_t desc) +inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t desc) { size_t size = 0; // clusterLabelsOut [maxBatchSize, numProbes] @@ -3139,8 +3061,8 @@ inline size_t ivfpq_search_bufferSize(cuannHandle_t handle, cuannIvfPqDescriptor size += _cuann_aligned(sizeof(float) * desc->maxBatchSize * desc->topK); } // preCompScores [multiProcessorCount, dimPq, 1 << bitPq,] - size += _cuann_aligned(sizeof(float) * (handle->deviceProp).multiProcessorCount * desc->dimPq * - (1 << desc->bitPq)); + size += + _cuann_aligned(sizeof(float) * getMultiProcessorCount() * desc->dimPq * (1 << desc->bitPq)); // topkWorkspace if (manage_local_topk(desc)) { size += _cuann_find_topk_bufferSize(handle, @@ -3318,11 +3240,6 @@ template __global__ void ivfpq_make_outputs( * */ -inline cuannStatus_t cuannCreate(cuannHandle_t* handle); -inline cuannStatus_t cuannDestroy(cuannHandle_t handle); -inline cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream); -inline cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId); - inline cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc); inline cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc); @@ -3349,7 +3266,7 @@ inline cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size /* bytes of dataset index */); inline cuannStatus_t cuannIvfPqBuildIndex( - cuannHandle_t handle, + const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* dataset, /* [numDataset, dimDataset] */ const void* trainset, /* [numTrainset, dimDataset] */ @@ -3360,18 +3277,18 @@ inline cuannStatus_t cuannIvfPqBuildIndex( bool hierarchicalClustering, /* If true, do kmeans training hierarchically */ void* index /* database index to build */); -inline cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqSaveIndex(const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, const char* fileName); -inline cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqLoadIndex(const handle_t& handle, cuannIvfPqDescriptor_t desc, void** index, const char* fileName); inline cuannStatus_t 
cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( - cuannHandle_t handle, + const handle_t& handle, const char* oldIndexFileName, const char* newIndexFileName, const void* newVectors, /* [numVectorsToAdd, dimDataset] */ @@ -3396,14 +3313,14 @@ inline cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t cudaDataType_t* smemLutDtype, uint32_t* preferredThreadBlockSize); -inline cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqSearch_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, uint32_t numQueries, size_t maxWorkspaceSize, size_t* workspaceSize); -inline cuannStatus_t cuannIvfPqSearch(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqSearch(const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, const void* queries, /* [numQueries, dimDataset] */ @@ -3601,8 +3518,15 @@ inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); exit(-1); } + switch (detail::utils::check_pointer_residency(clusterCenters, *output)) { + case detail::utils::pointer_residency::device_only: + case detail::utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("_cuann_get_sqsumClusters: not all pointers are available on the device."); + } + rmm::cuda_stream_default.synchronize(); detail::utils::dots_along_rows( desc->numClusters, desc->dimDataset, clusterCenters, *output, rmm::cuda_stream_default); + rmm::cuda_stream_default.synchronize(); } // @@ -3788,7 +3712,7 @@ uint32_t _get_num_trainset(uint32_t clusterSize, uint32_t dimPq, uint32_t bitPq) } // -inline void _cuann_compute_PQ_code(cuannHandle_t handle, +inline void _cuann_compute_PQ_code(const handle_t& handle, uint32_t numDataset, uint32_t dimDataset, uint32_t dimRotDataset, @@ -3819,23 +3743,20 @@ inline void _cuann_compute_PQ_code(cuannHandle_t handle, float** subVectors; // [numDevices][dimPq, maxClusterSize, lenPq] uint32_t** subVectorLabels; // [numDevices][dimPq, maxClusterSize] uint8_t** myPqDataset; // [numDevices][maxCluserSize, dimPq * bitPq / 8] - resVectors = _cuann_multi_device_malloc( - handle->numDevices, maxClusterSize * dimDataset, "resVectors"); - rotVectors = _cuann_multi_device_malloc( - handle->numDevices, maxClusterSize * dimRotDataset, "rotVectors"); - subVectors = _cuann_multi_device_malloc( - handle->numDevices, dimPq * maxClusterSize * lenPq, "subVectors"); - subVectorLabels = _cuann_multi_device_malloc( - handle->numDevices, dimPq * maxClusterSize, "subVectorLabels"); - myPqDataset = _cuann_multi_device_malloc( - handle->numDevices, maxClusterSize * dimPq * bitPq / 8, "myPqDataset"); + resVectors = _cuann_multi_device_malloc(1, maxClusterSize * dimDataset, "resVectors"); + rotVectors = _cuann_multi_device_malloc(1, maxClusterSize * dimRotDataset, "rotVectors"); + subVectors = _cuann_multi_device_malloc(1, dimPq * maxClusterSize * lenPq, "subVectors"); + subVectorLabels = + _cuann_multi_device_malloc(1, dimPq * maxClusterSize, "subVectorLabels"); + myPqDataset = + _cuann_multi_device_malloc(1, maxClusterSize * dimPq * bitPq / 8, "myPqDataset"); uint32_t maxTrainset = 0; if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { maxTrainset = _get_num_trainset(maxClusterSize, dimPq, bitPq); } void** pqPredictWorkspace = (void**)_cuann_multi_device_malloc( - handle->numDevices, + 1, _cuann_kmeans_predict_bufferSize((1 << bitPq), lenPq, max(maxClusterSize, maxTrainset)), "pqPredictWorkspace"); @@ 
-3846,18 +3767,15 @@ inline void _cuann_compute_PQ_code(cuannHandle_t handle, float** myPqCentersTemp; // [numDevices][1 << bitPq, lenPq] if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { memset(pqCenters, 0, sizeof(float) * numClusters * (1 << bitPq) * lenPq); - rotVectorLabels = _cuann_multi_device_malloc( - handle->numDevices, maxClusterSize * dimPq, "rotVectorLabels"); - pqClusterSize = - _cuann_multi_device_malloc(handle->numDevices, (1 << bitPq), "pqClusterSize"); - wsKAC = _cuann_multi_device_malloc(handle->numDevices, 1, "wsKAC"); - myPqCenters = - _cuann_multi_device_malloc(handle->numDevices, (1 << bitPq) * lenPq, "myPqCenters"); - myPqCentersTemp = _cuann_multi_device_malloc( - handle->numDevices, (1 << bitPq) * lenPq, "myPqCentersTemp"); - } - -#pragma omp parallel for schedule(dynamic) num_threads(handle->numDevices) + rotVectorLabels = + _cuann_multi_device_malloc(1, maxClusterSize * dimPq, "rotVectorLabels"); + pqClusterSize = _cuann_multi_device_malloc(1, (1 << bitPq), "pqClusterSize"); + wsKAC = _cuann_multi_device_malloc(1, 1, "wsKAC"); + myPqCenters = _cuann_multi_device_malloc(1, (1 << bitPq) * lenPq, "myPqCenters"); + myPqCentersTemp = _cuann_multi_device_malloc(1, (1 << bitPq) * lenPq, "myPqCentersTemp"); + } + +#pragma omp parallel for schedule(dynamic) num_threads(1) for (uint32_t l = 0; l < numClusters; l++) { int devId = omp_get_thread_num(); cudaSetDevice(devId); @@ -3909,10 +3827,10 @@ inline void _cuann_compute_PQ_code(cuannHandle_t handle, // // Rotate the residual vectors using a rotation matrix // - cudaStream_t cublasStream = _cuann_set_cublas_stream(handle->cublasHandles[devId], NULL); + cudaStream_t cublasStream = _cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); float alpha = 1.0; float beta = 0.0; - cublasStatus_t cublasError = cublasGemmEx(handle->cublasHandles[devId], + cublasStatus_t cublasError = cublasGemmEx(handle.get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, dimRotDataset, @@ -3936,7 +3854,7 @@ inline void _cuann_compute_PQ_code(cuannHandle_t handle, // return CUANN_STATUS_CUBLAS_ERROR; exit(-1); } - _cuann_set_cublas_stream(handle->cublasHandles[devId], cublasStream); + _cuann_set_cublas_stream(handle.get_cublas_handle(), cublasStream); // // Training PQ codebook if CUANN_PQ_CENTER_PER_CLUSTER @@ -3957,7 +3875,7 @@ inline void _cuann_compute_PQ_code(cuannHandle_t handle, (float)iter / 2, numIterations); } - _cuann_kmeans_predict(handle->cublasHandles[devId], + _cuann_kmeans_predict(handle, myPqCenters[devId], (1 << bitPq), lenPq, @@ -4019,7 +3937,7 @@ inline void _cuann_compute_PQ_code(cuannHandle_t handle, curPqCenters = pqCenters + ((1 << bitPq) * lenPq) * l; if (numIterations > 0) { curPqCenters = myPqCenters[devId]; } } - _cuann_kmeans_predict(handle->cublasHandles[devId], + _cuann_kmeans_predict(handle, curPqCenters, (1 << bitPq), lenPq, @@ -4050,130 +3968,19 @@ inline void _cuann_compute_PQ_code(cuannHandle_t handle, fprintf(stderr, "\n"); // - _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, handle->numDevices); - _cuann_multi_device_free(myPqDataset, handle->numDevices); - _cuann_multi_device_free(subVectorLabels, handle->numDevices); - _cuann_multi_device_free(subVectors, handle->numDevices); - _cuann_multi_device_free(rotVectors, handle->numDevices); - _cuann_multi_device_free(resVectors, handle->numDevices); + _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, 1); + _cuann_multi_device_free(myPqDataset, 1); + _cuann_multi_device_free(subVectorLabels, 1); + 
_cuann_multi_device_free(subVectors, 1); + _cuann_multi_device_free(rotVectors, 1); + _cuann_multi_device_free(resVectors, 1); if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { - _cuann_multi_device_free(wsKAC, handle->numDevices); - _cuann_multi_device_free(rotVectorLabels, handle->numDevices); - _cuann_multi_device_free(pqClusterSize, handle->numDevices); - _cuann_multi_device_free(myPqCenters, handle->numDevices); - _cuann_multi_device_free(myPqCentersTemp, handle->numDevices); - } -} - -// cuannCreate -inline cuannStatus_t cuannCreate(cuannHandle_t* handle) -{ - cudaError_t cudaError; - cublasStatus_t cublasError; - - *handle = (cuannHandle_t)malloc(sizeof(struct cuannContext)); - if (*handle == NULL) { return CUANN_STATUS_ALLOC_FAILED; } - - // Keep the current device ID. - int devId; - cudaError = cudaGetDevice(&devId); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaGetDevice() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUDA_ERROR; - } - - // numDevices - cudaGetDeviceCount(&((*handle)->numDevices)); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaGetDeviceCount() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUDA_ERROR; - } - - (*handle)->streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * (*handle)->numDevices); - (*handle)->deviceProps = (cudaDeviceProp*)malloc(sizeof(cudaDeviceProp) * (*handle)->numDevices); - (*handle)->cublasHandles = - (cublasHandle_t*)malloc(sizeof(cublasHandle_t) * (*handle)->numDevices); - - for (int i = 0; i < (*handle)->numDevices; i++) { - cudaError = cudaSetDevice(i); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaSetDevice() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUDA_ERROR; - } - - // stream - (*handle)->streams[i] = NULL; - - // deviceProp - cudaError = cudaGetDeviceProperties(&((*handle)->deviceProps[i]), i); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaGetDeviceProperties() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUDA_ERROR; - } - - // cublasHandle - cublasError = cublasCreate(&((*handle)->cublasHandles[i])); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasCreate() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUBLAS_ERROR; - } - } - - return cuannSetDevice(*handle, devId); -} - -// cuannDestroy -inline cuannStatus_t cuannDestroy(cuannHandle_t handle) -{ - if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - cublasStatus_t cublasError; - for (int i = 0; i < handle->numDevices; i++) { - cublasError = cublasDestroy(handle->cublasHandles[i]); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasDestroy() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUBLAS_ERROR; - } - } - free(handle->streams); - free(handle->deviceProps); - free(handle->cublasHandles); - free(handle); - return CUANN_STATUS_SUCCESS; -} - -// cuannSetStream -inline cuannStatus_t cuannSetStream(cuannHandle_t handle, cudaStream_t stream) -{ - if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - int devId = handle->devId; - cublasSetStream(handle->cublasHandles[devId], stream); - handle->streams[devId] = stream; - - return cuannSetDevice(handle, devId); -} - -// cuannSetDevice -inline cuannStatus_t cuannSetDevice(cuannHandle_t handle, int devId) -{ - if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - if (devId < 0 || devId >= handle->numDevices) { - fprintf( - stderr, "(%s, %d) devId is out of range (devId:%d) failed.\n", 
__func__, __LINE__, devId); - return CUANN_STATUS_INVALID_VALUE; + _cuann_multi_device_free(wsKAC, 1); + _cuann_multi_device_free(rotVectorLabels, 1); + _cuann_multi_device_free(pqClusterSize, 1); + _cuann_multi_device_free(myPqCenters, 1); + _cuann_multi_device_free(myPqCentersTemp, 1); } - - // (*) Need to re-consider whether it is good to call cudaSetDevice() here. - cudaError_t cudaError = cudaSetDevice(devId); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaSetDevice() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUDA_ERROR; - } - - handle->devId = devId; - handle->stream = handle->streams[devId]; - handle->deviceProp = handle->deviceProps[devId]; - handle->cublasHandle = handle->cublasHandles[devId]; - return CUANN_STATUS_SUCCESS; } // cuannIvfPqCreateDescriptor @@ -4342,7 +4149,7 @@ inline cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* } // cuannIvfPqBuildIndex -inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* dataset, const void* trainset, @@ -4353,8 +4160,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, bool hierarchicalClustering, void* index) { - if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - int cuannDevId = handle->devId; + int cuannDevId = handle.get_device(); int callerDevId = _cuann_set_device(cuannDevId); if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { @@ -4424,7 +4230,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, return CUANN_STATUS_ALLOC_FAILED; } - uint32_t** wsKAC = _cuann_multi_device_malloc(handle->numDevices, 1, "wsKAC"); + uint32_t** wsKAC = _cuann_multi_device_malloc(1, 1, "wsKAC"); // // Training kmeans @@ -4475,7 +4281,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, __func__, (float)iter / 2, numIterations); - _cuann_kmeans_predict(handle->cublasHandle, + _cuann_kmeans_predict(handle, mesoClusterCenters, numMesoClusters, desc->dimDataset, @@ -4538,24 +4344,23 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, assert(csumFineClusters[numMesoClusters] == desc->numClusters); uint32_t** idsTrainset = - _cuann_multi_device_malloc(handle->numDevices, mesoClusterSizeMax, "idsTrainset"); + _cuann_multi_device_malloc(1, mesoClusterSizeMax, "idsTrainset"); - float** subTrainset = _cuann_multi_device_malloc( - handle->numDevices, mesoClusterSizeMax * desc->dimDataset, "subTrainset"); + float** subTrainset = + _cuann_multi_device_malloc(1, mesoClusterSizeMax * desc->dimDataset, "subTrainset"); // label (cluster ID) of each vector - uint32_t** labelsMP = - _cuann_multi_device_malloc(handle->numDevices, mesoClusterSizeMax, "labelsMP"); + uint32_t** labelsMP = _cuann_multi_device_malloc(1, mesoClusterSizeMax, "labelsMP"); float** clusterCentersEach = _cuann_multi_device_malloc( - handle->numDevices, numFineClustersMax * desc->dimDataset, "clusterCentersEach"); + 1, numFineClustersMax * desc->dimDataset, "clusterCentersEach"); float** clusterCentersMP = _cuann_multi_device_malloc( - handle->numDevices, numFineClustersMax * desc->dimDataset, "clusterCentersMP"); + 1, numFineClustersMax * desc->dimDataset, "clusterCentersMP"); // number of vectors in each cluster uint32_t** clusterSizeMP = - _cuann_multi_device_malloc(handle->numDevices, numFineClustersMax, "clusterSizeMP"); + _cuann_multi_device_malloc(1, numFineClustersMax, "clusterSizeMP"); size_t sizePredictWorkspace = 
0; for (uint32_t i = 0; i < numMesoClusters; i++) { @@ -4566,13 +4371,13 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, mesoClusterSize[i] // number of vectors )); } - void** predictWorkspace = (void**)_cuann_multi_device_malloc( - handle->numDevices, sizePredictWorkspace, "predictWorkspace"); + void** predictWorkspace = + (void**)_cuann_multi_device_malloc(1, sizePredictWorkspace, "predictWorkspace"); // // Training kmeans for clusters in each meso-clusters // -#pragma omp parallel for schedule(dynamic) num_threads(handle->numDevices) +#pragma omp parallel for schedule(dynamic) num_threads(1) for (uint32_t i = 0; i < numMesoClusters; i++) { int devId = omp_get_thread_num(); cudaSetDevice(devId); @@ -4625,7 +4430,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, (float)iter / 2, numIterations); } - _cuann_kmeans_predict(handle->cublasHandles[devId], + _cuann_kmeans_predict(handle, clusterCentersEach[devId], numFineClusters[i], desc->dimDataset, @@ -4658,20 +4463,20 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, sizeof(float) * numFineClusters[i] * desc->dimDataset, cudaMemcpyDeviceToDevice); } - for (int devId = 0; devId < handle->numDevices; devId++) { + for (int devId = 0; devId < 1; devId++) { cudaSetDevice(devId); cudaDeviceSynchronize(); } fprintf(stderr, "\n"); cudaSetDevice(cuannDevId); - _cuann_multi_device_free(idsTrainset, handle->numDevices); - _cuann_multi_device_free(subTrainset, handle->numDevices); - _cuann_multi_device_free(labelsMP, handle->numDevices); - _cuann_multi_device_free(clusterCentersEach, handle->numDevices); - _cuann_multi_device_free(clusterCentersMP, handle->numDevices); - _cuann_multi_device_free(clusterSizeMP, handle->numDevices); - _cuann_multi_device_free((uint8_t**)predictWorkspace, handle->numDevices); + _cuann_multi_device_free(idsTrainset, 1); + _cuann_multi_device_free(subTrainset, 1); + _cuann_multi_device_free(labelsMP, 1); + _cuann_multi_device_free(clusterCentersEach, 1); + _cuann_multi_device_free(clusterCentersMP, 1); + _cuann_multi_device_free(clusterSizeMP, 1); + _cuann_multi_device_free((uint8_t**)predictWorkspace, 1); cudaFree(mesoClusterSize); cudaFree(mesoClusterLabels); @@ -4699,8 +4504,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, __func__, (float)iter / X, numIterations_X / X); - _cuann_kmeans_predict_MP(handle->numDevices, - handle->cublasHandles, + _cuann_kmeans_predict_MP(handle, clusterCenters, desc->numClusters, desc->dimDataset, @@ -4733,7 +4537,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, for (int iter = 0; iter < numIterations_2; iter += 2) { fprintf( stderr, "(%s) Training kmeans: %.1f / %u \r", __func__, (float)iter / 2, numIterations); - _cuann_kmeans_predict(handle->cublasHandle, + _cuann_kmeans_predict(handle, clusterCenters, desc->numClusters, desc->dimDataset, @@ -4775,8 +4579,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, // Predict labels of whole dataset (with multiple GPUs) // fprintf(stderr, "(%s) Final fitting\n", __func__); - _cuann_kmeans_predict_MP(handle->numDevices, - handle->cublasHandles, + _cuann_kmeans_predict_MP(handle, clusterCenters, desc->numClusters, desc->dimDataset, @@ -4802,10 +4605,10 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, desc->dimRotDataset, desc->dimDataset, desc->lenPq, randomRotation, rotationMatrix); // Rotate clusterCenters - cudaStream_t cublasStream = _cuann_set_cublas_stream(handle->cublasHandle, NULL); + cudaStream_t cublasStream = 
_cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); float alpha = 1.0; float beta = 0.0; - cublasStatus_t cublasError = cublasGemmEx(handle->cublasHandle, + cublasStatus_t cublasError = cublasGemmEx(handle.get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, desc->dimRotDataset, @@ -4828,7 +4631,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); return CUANN_STATUS_CUBLAS_ERROR; } - _cuann_set_cublas_stream(handle->cublasHandle, cublasStream); + _cuann_set_cublas_stream(handle.get_cublas_handle(), cublasStream); // // Make indexPtr, originalNumbers and pqDataset @@ -4860,21 +4663,21 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, } // [numDevices][1 << bitPq, lenPq] - float** pqCentersTemp = _cuann_multi_device_malloc( - handle->numDevices, (1 << desc->bitPq) * desc->lenPq, "pqCentersTemp"); + float** pqCentersTemp = + _cuann_multi_device_malloc(1, (1 << desc->bitPq) * desc->lenPq, "pqCentersTemp"); // [numDevices][1 << bitPq,] uint32_t** pqClusterSize = - _cuann_multi_device_malloc(handle->numDevices, (1 << desc->bitPq), "pqClusterSize"); + _cuann_multi_device_malloc(1, (1 << desc->bitPq), "pqClusterSize"); // Allocate workspace for PQ codebook training size_t sizePqPredictWorkspace = _cuann_kmeans_predict_bufferSize((1 << desc->bitPq), desc->lenPq, numTrainset); - sizePqPredictWorkspace = max(sizePqPredictWorkspace, + sizePqPredictWorkspace = max(sizePqPredictWorkspace, _cuann_kmeans_predict_bufferSize( (1 << desc->bitPq), desc->lenPq, maxClusterSize * desc->dimPq)); - void** pqPredictWorkspace = (void**)_cuann_multi_device_malloc( - handle->numDevices, sizePqPredictWorkspace, "pqPredictWorkspace"); + void** pqPredictWorkspace = + (void**)_cuann_multi_device_malloc(1, sizePqPredictWorkspace, "pqPredictWorkspace"); if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { // @@ -4884,8 +4687,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, // Predict label of trainset again (with multiple GPUs) fprintf(stderr, "(%s) Predict label of trainset again\n", __func__); - _cuann_kmeans_predict_MP(handle->numDevices, - handle->cublasHandles, + _cuann_kmeans_predict_MP(handle, clusterCenters, desc->numClusters, desc->dimDataset, @@ -4944,17 +4746,17 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, } // [numDevices][numTrainset, lenPq] - float** subTrainset = _cuann_multi_device_malloc( - handle->numDevices, numTrainset * desc->lenPq, "subTrainset"); + float** subTrainset = + _cuann_multi_device_malloc(1, numTrainset * desc->lenPq, "subTrainset"); // [numDevices][numTrainset] uint32_t** subTrainsetLabels = - _cuann_multi_device_malloc(handle->numDevices, numTrainset, "subTrainsetLabels"); + _cuann_multi_device_malloc(1, numTrainset, "subTrainsetLabels"); - float** pqCentersEach = _cuann_multi_device_malloc( - handle->numDevices, ((1 << desc->bitPq) * desc->lenPq), "pqCentersEach"); + float** pqCentersEach = + _cuann_multi_device_malloc(1, ((1 << desc->bitPq) * desc->lenPq), "pqCentersEach"); -#pragma omp parallel for schedule(dynamic) num_threads(handle->numDevices) +#pragma omp parallel for schedule(dynamic) num_threads(1) for (uint32_t j = 0; j < desc->dimPq; j++) { int devId = omp_get_thread_num(); cudaSetDevice(devId); @@ -4977,7 +4779,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, (float)iter / 2, numIterations); } - _cuann_kmeans_predict(handle->cublasHandles[devId], + _cuann_kmeans_predict(handle, pqCentersEach[devId], (1 
<< desc->bitPq), desc->lenPq, @@ -5020,9 +4822,9 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, fprintf(stderr, "\n"); cudaSetDevice(cuannDevId); - _cuann_multi_device_free(subTrainset, handle->numDevices); - _cuann_multi_device_free(subTrainsetLabels, handle->numDevices); - _cuann_multi_device_free(pqCentersEach, handle->numDevices); + _cuann_multi_device_free(subTrainset, 1); + _cuann_multi_device_free(subTrainsetLabels, 1); + _cuann_multi_device_free(pqCentersEach, 1); free(modTrainset); } @@ -5100,25 +4902,24 @@ inline cuannStatus_t cuannIvfPqBuildIndex(cuannHandle_t handle, cudaFree(datasetLabels); cudaFree(clusterCentersTemp); - _cuann_multi_device_free(wsKAC, handle->numDevices); - _cuann_multi_device_free(pqCentersTemp, handle->numDevices); - _cuann_multi_device_free(pqClusterSize, handle->numDevices); - _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, handle->numDevices); + _cuann_multi_device_free(wsKAC, 1); + _cuann_multi_device_free(pqCentersTemp, 1); + _cuann_multi_device_free(pqClusterSize, 1); + _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, 1); - cuannSetDevice(handle, cuannDevId); _cuann_set_device(callerDevId); return CUANN_STATUS_SUCCESS; } // cuannIvfPqSaveIndex -inline cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqSaveIndex(const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, const char* fileName) { - if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - int orgDevId = _cuann_set_device(handle->devId); + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int orgDevId = _cuann_set_device(handle.get_device()); FILE* fp = fopen(fileName, "w"); if (fp == NULL) { @@ -5138,13 +4939,13 @@ inline cuannStatus_t cuannIvfPqSaveIndex(cuannHandle_t handle, } // cuannIvfPqLoadIndex -inline cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqLoadIndex(const handle_t& handle, cuannIvfPqDescriptor_t desc, void** index, const char* fileName) { - if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - int orgDevId = _cuann_set_device(handle->devId); + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int orgDevId = _cuann_set_device(handle.get_device()); if (1 /* *index == NULL */) { FILE* fp = fopen(fileName, "r"); @@ -5167,7 +4968,7 @@ inline cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, } fclose(fp); - cudaMemAdvise(index, indexSize, cudaMemAdviseSetReadMostly, handle->devId); + cudaMemAdvise(index, indexSize, cudaMemAdviseSetReadMostly, handle.get_device()); } struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(*index); @@ -5211,34 +5012,34 @@ inline cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, size_t size; // pqDataset size = sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8; - if (size < (handle->deviceProp).totalGlobalMem) { - cudaMemPrefetchAsync(pqDataset, size, handle->devId); + if (size < handle.get_device_properties().totalGlobalMem) { + cudaMemPrefetchAsync(pqDataset, size, handle.get_device()); } // clusterCenters size = sizeof(float) * desc->numClusters * desc->dimDatasetExt; - cudaMemPrefetchAsync(clusterCenters, size, handle->devId); + cudaMemPrefetchAsync(clusterCenters, size, handle.get_device()); // pqCenters if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { size = sizeof(float) * desc->dimPq * (1 << desc->bitPq) * desc->lenPq; } else { size = sizeof(float) * desc->numClusters * (1 << desc->bitPq) * 
desc->lenPq; } - cudaMemPrefetchAsync(pqCenters, size, handle->devId); + cudaMemPrefetchAsync(pqCenters, size, handle.get_device()); // originalNumbers size = sizeof(uint32_t) * desc->numDataset; - cudaMemPrefetchAsync(originalNumbers, size, handle->devId); + cudaMemPrefetchAsync(originalNumbers, size, handle.get_device()); // indexPtr size = sizeof(uint32_t) * (desc->numClusters + 1); - cudaMemPrefetchAsync(indexPtr, size, handle->devId); + cudaMemPrefetchAsync(indexPtr, size, handle.get_device()); // rotationMatrix if (rotationMatrix != NULL) { size = sizeof(float) * desc->dimDataset * desc->dimRotDataset; - cudaMemPrefetchAsync(rotationMatrix, size, handle->devId); + cudaMemPrefetchAsync(rotationMatrix, size, handle.get_device()); } // clusterRotCenters if (clusterRotCenters != NULL) { size = sizeof(float) * desc->numClusters * desc->dimRotDataset; - cudaMemPrefetchAsync(clusterRotCenters, size, handle->devId); + cudaMemPrefetchAsync(clusterRotCenters, size, handle.get_device()); } _cuann_set_device(orgDevId); @@ -5247,7 +5048,7 @@ inline cuannStatus_t cuannIvfPqLoadIndex(cuannHandle_t handle, // cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( - cuannHandle_t handle, + const handle_t& handle, const char* oldIndexFileName, const char* newIndexFileName, const void* newVectors, /* [numNewVectors, dimDataset] */ @@ -5255,14 +5056,13 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( { cudaError_t cudaError; cuannStatus_t ret; - if (handle == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } cudaPointerAttributes attr; cudaPointerGetAttributes(&attr, newVectors); if (attr.type == cudaMemoryTypeDevice) { fprintf(stderr, "(%s, %d) newVectors must be accessible from the host.\n", __func__, __LINE__); return CUANN_STATUS_INVALID_POINTER; } - int cuannDevId = handle->devId; + int cuannDevId = handle.get_device(); int callerDevId = _cuann_set_device(cuannDevId); // @@ -5335,8 +5135,7 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( } cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters); fprintf(stderr, "(%s) Predict label of new vectors\n", __func__); - _cuann_kmeans_predict_MP(handle->numDevices, - handle->cublasHandles, + _cuann_kmeans_predict_MP(handle, clusterCenters, oldDesc->numClusters, oldDesc->dimDataset, @@ -5581,7 +5380,6 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( cudaFree(clusterCenters); cudaFree(oldIndex); - cuannSetDevice(handle, cuannDevId); _cuann_set_device(callerDevId); return CUANN_STATUS_SUCCESS; @@ -5715,14 +5513,14 @@ inline cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t } // cuannIvfPqSearch -inline cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, +inline cuannStatus_t cuannIvfPqSearch_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, uint32_t maxQueries, size_t maxWorkspaceSize, size_t* workspaceSize) { - if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } size_t max_ws = maxWorkspaceSize; if (max_ws == 0) { @@ -5771,7 +5569,7 @@ inline cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, if (1) { // Adjust maxBatchSize to improve GPU occupancy of topk kernel. 
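// Aside (hedged sketch, not part of the patch): the occupancy check in this hunk, restated as a
// tiny self-contained helper. The helper and parameter names are hypothetical; `sm_count` stands
// in for getMultiProcessorCount(). The 0.6 threshold matches the condition visible below, while
// the exact amount by which maxBatchSize is shrunk lives in the unchanged context lines.
#include <cstdint>
inline bool should_shrink_max_batch_size(uint32_t max_batch_size, uint32_t sm_count)
{
  uint32_t num_cta_total     = sm_count * 2;                    // target two resident CTAs per SM
  uint32_t num_cta_per_batch = num_cta_total / max_batch_size;  // CTAs available to each batch
  float utilization = static_cast<float>(num_cta_per_batch * max_batch_size) / num_cta_total;
  return (num_cta_per_batch > 1) || (num_cta_per_batch == 1 && utilization < 0.6f);
}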
- uint32_t numCta_total = (handle->deviceProp).multiProcessorCount * 2; + uint32_t numCta_total = getMultiProcessorCount() * 2; uint32_t numCta_perBatch = numCta_total / desc->maxBatchSize; float utilization = (float)numCta_perBatch * desc->maxBatchSize / numCta_total; if (numCta_perBatch > 1 || (numCta_perBatch == 1 && utilization < 0.6)) { @@ -5805,7 +5603,7 @@ inline cuannStatus_t cuannIvfPqSearch_bufferSize(cuannHandle_t handle, // cuannIvfPqSearch inline cuannStatus_t cuannIvfPqSearch( - cuannHandle_t handle, + const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, const void* queries, // [numQueries, dimDataset], host or device pointer @@ -5815,8 +5613,8 @@ inline cuannStatus_t cuannIvfPqSearch( float* distances, // [numQueries, topK], device pointer void* workspace) { - if (handle == NULL || desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - int orgDevId = _cuann_set_device(handle->devId); + if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + int orgDevId = _cuann_set_device(handle.get_device()); if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { return CUANN_STATUS_UNSUPPORTED_DTYPE; @@ -5866,7 +5664,7 @@ inline cuannStatus_t cuannIvfPqSearch( searchWorkspace = (void*)((uint8_t*)clusterLabelsToProbe + _cuann_aligned(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); - void (*_ivfpq_search)(cuannHandle_t, + void (*_ivfpq_search)(const handle_t&, cuannIvfPqDescriptor_t, uint32_t, const float*, @@ -5933,7 +5731,7 @@ inline cuannStatus_t cuannIvfPqSearch( ptrQueries, sizeof(float) * nQueries * desc->dimDataset, cudaMemcpyHostToDevice, - handle->stream); + handle.get_stream()); ptrQueries = (float*)devQueries; } _cuann_copy_fill(nQueries, @@ -5944,7 +5742,7 @@ inline cuannStatus_t cuannIvfPqSearch( desc->dimDatasetExt, fillValue, divisor, - handle->stream); + handle.get_stream()); } else if (dtype == CUDA_R_8U) { uint8_t* ptrQueries = (uint8_t*)queries + ((uint64_t)(desc->dimDataset) * i); if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { @@ -5952,7 +5750,7 @@ inline cuannStatus_t cuannIvfPqSearch( ptrQueries, sizeof(uint8_t) * nQueries * desc->dimDataset, cudaMemcpyHostToDevice, - handle->stream); + handle.get_stream()); ptrQueries = (uint8_t*)devQueries; } _cuann_copy_fill(nQueries, @@ -5963,7 +5761,7 @@ inline cuannStatus_t cuannIvfPqSearch( desc->dimDatasetExt, fillValue, divisor, - handle->stream); + handle.get_stream()); } else if (dtype == CUDA_R_8I) { int8_t* ptrQueries = (int8_t*)queries + ((uint64_t)(desc->dimDataset) * i); if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { @@ -5971,7 +5769,7 @@ inline cuannStatus_t cuannIvfPqSearch( ptrQueries, sizeof(int8_t) * nQueries * desc->dimDataset, cudaMemcpyHostToDevice, - handle->stream); + handle.get_stream()); ptrQueries = (int8_t*)devQueries; } _cuann_copy_fill(nQueries, @@ -5982,7 +5780,7 @@ inline cuannStatus_t cuannIvfPqSearch( desc->dimDatasetExt, fillValue, divisor, - handle->stream); + handle.get_stream()); } float alpha; @@ -5997,7 +5795,7 @@ inline cuannStatus_t cuannIvfPqSearch( gemmK = desc->dimDataset + 1; assert(gemmK <= desc->dimDatasetExt); } - cublasError = cublasGemmEx(handle->cublasHandle, + cublasError = cublasGemmEx(handle.get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, desc->numClusters, @@ -6024,7 +5822,7 @@ inline cuannStatus_t cuannIvfPqSearch( // Rotate queries alpha = 1.0; beta = 0.0; - cublasError = cublasGemmEx(handle->cublasHandle, + cublasError = cublasGemmEx(handle.get_cublas_handle(), 
CUBLAS_OP_T, CUBLAS_OP_N, desc->dimRotDataset, @@ -6921,7 +6719,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( // search template -inline void ivfpq_search(cuannHandle_t handle, +inline void ivfpq_search(const handle_t& handle, cuannIvfPqDescriptor_t desc, uint32_t numQueries, const float* clusterCenters, // [numDataset, dimRotDataset] @@ -6984,15 +6782,15 @@ inline void ivfpq_search(cuannHandle_t handle, (float*)((uint8_t*)similarity + _cuann_aligned(sizeof(scoreDtype) * desc->maxBatchSize * desc->maxSamples)); } - topkWorkspace = (void*)((uint8_t*)preCompScores + - _cuann_aligned(sizeof(float) * (handle->deviceProp).multiProcessorCount * - desc->dimPq * (1 << desc->bitPq))); + topkWorkspace = + (void*)((uint8_t*)preCompScores + _cuann_aligned(sizeof(float) * getMultiProcessorCount() * + desc->dimPq * (1 << desc->bitPq))); // if (manage_local_topk(desc)) { dim3 iksThreads(128, 1, 1); dim3 iksBlocks(((numQueries * desc->topK) + iksThreads.x - 1) / iksThreads.x, 1, 1); - ivfpq_init_topkScores<<stream>>>( + ivfpq_init_topkScores<<>>( topkScores, FLT_MAX, numQueries * desc->topK); #ifdef CUANN_DEBUG cudaError = cudaDeviceSynchronize(); @@ -7006,7 +6804,7 @@ inline void ivfpq_search(cuannHandle_t handle, // dim3 mcThreads(1024, 1, 1); // DO NOT CHANGE dim3 mcBlocks(numQueries, 1, 1); - ivfpq_make_chunk_index_ptr<<stream>>>( + ivfpq_make_chunk_index_ptr<<>>( desc->numProbes, numQueries, indexPtr, clusterLabelsToProbe, chunkIndexPtr, numSamples); #ifdef CUANN_DEBUG cudaError = cudaDeviceSynchronize(); @@ -7023,8 +6821,8 @@ inline void ivfpq_search(cuannHandle_t handle, // possible. dim3 psThreads(128, 1, 1); dim3 psBlocks((numQueries * desc->numProbes + psThreads.x - 1) / psThreads.x, 1, 1); - ivfpq_prep_sort<<stream>>>(numQueries * desc->numProbes, - indexList); + ivfpq_prep_sort<<>>(numQueries * desc->numProbes, + indexList); #ifdef CUANN_DEBUG cudaError = cudaDeviceSynchronize(); if (cudaError != cudaSuccess) { @@ -7044,7 +6842,7 @@ inline void ivfpq_search(cuannHandle_t handle, numQueries * desc->numProbes, begin_bit, end_bit, - handle->stream); + handle.get_stream()); #ifdef CUANN_DEBUG cudaError = cudaDeviceSynchronize(); if (cudaError != cudaSuccess) { @@ -7126,7 +6924,14 @@ inline void ivfpq_search(cuannHandle_t handle, case 2: SET_KERNEL3(2); break; case 3: SET_KERNEL3(3); break; case 4: SET_KERNEL3(4); break; - } + default: RAFT_FAIL("ivf_pq::search(k = %u): depth value is too big (%d)", desc->topK, depth); + } + RAFT_LOG_INFO("ivf_pq::search(k = %u, depth = %d, dim = %u/%u/%u)", + desc->topK, + depth, + desc->dimDataset, + desc->dimRotDataset, + desc->dimPq); constexpr size_t thresholdSmem = 48 * 1024; size_t sizeSmem = sizeof(smemLutDtype) * desc->dimPq * (1 << desc->bitPq); size_t sizeSmemBaseDiff = sizeof(float) * desc->dimDataset; @@ -7137,11 +6942,8 @@ inline void ivfpq_search(cuannHandle_t handle, if (desc->preferredThreadBlockSize == 0) { constexpr int minThreads = 256; while (numThreads > minThreads) { - if (numCTAs < - uint32_t((handle->deviceProp).multiProcessorCount * (1024 / (numThreads / 2)))) { - break; - } - if ((handle->deviceProp).sharedMemPerMultiprocessor * 2 / 3 < + if (numCTAs < uint32_t(getMultiProcessorCount() * (1024 / (numThreads / 2)))) { break; } + if (handle.get_device_properties().sharedMemPerMultiprocessor * 2 / 3 < sizeSmem * (1024 / (numThreads / 2))) { break; } @@ -7168,7 +6970,7 @@ inline void ivfpq_search(cuannHandle_t handle, numThreads = 1024; size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, 
numThreads); sizeSmem = max(sizeSmemBaseDiff, sizeSmemForLocalTopk); - numCTAs = (handle->deviceProp).multiProcessorCount; + numCTAs = getMultiProcessorCount(); } } if (kernel_no_basediff_available) { @@ -7211,27 +7013,27 @@ inline void ivfpq_search(cuannHandle_t handle, } dim3 ctaThreads(numThreads, 1, 1); dim3 ctaBlocks(numCTAs, 1, 1); - kernel<<stream>>>(desc->numDataset, - desc->dimRotDataset, - desc->numProbes, - desc->dimPq, - numQueries, - desc->maxSamples, - desc->similarity, - desc->typePqCenter, - desc->topK, - clusterCenters, - pqCenters, - pqDataset, - indexPtr, - clusterLabelsToProbe, - chunkIndexPtr, - query, - indexListSorted, - preCompScores, - topkScores, - (scoreDtype*)similarity, - simTopkIndex); + kernel<<>>(desc->numDataset, + desc->dimRotDataset, + desc->numProbes, + desc->dimPq, + numQueries, + desc->maxSamples, + desc->similarity, + desc->typePqCenter, + desc->topK, + clusterCenters, + pqCenters, + pqDataset, + indexPtr, + clusterLabelsToProbe, + chunkIndexPtr, + query, + indexListSorted, + preCompScores, + topkScores, + (scoreDtype*)similarity, + simTopkIndex); #ifdef CUANN_DEBUG cudaError = cudaDeviceSynchronize(); if (cudaError != cudaSuccess) { @@ -7272,19 +7074,19 @@ inline void ivfpq_search(cuannHandle_t handle, dim3 moThreads(128, 1, 1); dim3 moBlocks((desc->topK + moThreads.x - 1) / moThreads.x, numQueries, 1); ivfpq_make_outputs - <<stream>>>(desc->numProbes, - desc->topK, - desc->maxSamples, - numQueries, - indexPtr, - originalNumbers, - clusterLabelsToProbe, - chunkIndexPtr, - (scoreDtype*)similarity, - simTopkIndex, - topkSids, - topkNeighbors, - topkDistances); + <<>>(desc->numProbes, + desc->topK, + desc->maxSamples, + numQueries, + indexPtr, + originalNumbers, + clusterLabelsToProbe, + chunkIndexPtr, + (scoreDtype*)similarity, + simTopkIndex, + topkSids, + topkNeighbors, + topkDistances); #ifdef CUANN_DEBUG cudaError = cudaDeviceSynchronize(); if (cudaError != cudaSuccess) { diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu index f6351b8025..145a7237bd 100644 --- a/cpp/test/spatial/ann_ivf_pq.cu +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -159,13 +159,6 @@ class IvfPqTest : public ::testing::TestWithParam { rmm::device_uvector indices_ivf_pq_dev(queries_size, stream_); { - std::unique_ptr> - cuann_handle{[]() { - ivf_pq::cuannHandle_t h; - CUANN_CHECK(ivf_pq::cuannCreate(&h)); - return h; - }(), - [](ivf_pq::cuannHandle_t h) { ivf_pq::cuannDestroy(h); }}; std::unique_ptr> cuann_desc{ @@ -176,9 +169,6 @@ class IvfPqTest : public ::testing::TestWithParam { }(), [](ivf_pq::cuannIvfPqDescriptor_t d) { ivf_pq::cuannIvfPqDestroyDescriptor(d); }}; - CUANN_CHECK(ivf_pq::cuannSetDevice(cuann_handle.get(), handle_.get_device())); - CUANN_CHECK(ivf_pq::cuannSetStream(cuann_handle.get(), handle_.get_stream())); - // Number of kmeans clusters. // // The number of vectors per cluster, or 'numDataset' / 'numClusters', @@ -244,7 +234,7 @@ class IvfPqTest : public ::testing::TestWithParam { dtype = CUDA_R_32F; } CUANN_CHECK(ivf_pq::cuannIvfPqBuildIndex( - cuann_handle.get(), + handle_, cuann_desc.get(), database.data(), // dataset database.data(), // ?kmeans? trainset @@ -284,17 +274,15 @@ class IvfPqTest : public ::testing::TestWithParam { uint32_t preferredThreadBlockSize = 0; // 0, 256, 512, or 1024 CUANN_CHECK(ivf_pq::cuannIvfPqSetSearchTuningParameters( cuann_desc.get(), internalDistanceDtype, smemLutDtype, preferredThreadBlockSize)); - // Maximum number of query vectors to search. 
- uint32_t maxQueries = 1000000; // Maximum number of query vectors to search at the same time. - uint32_t batchSize = maxQueries; + uint32_t batchSize = std::min(ps.num_queries, 32768); // Maximum device memory size that may be used as workspace at search time. // maxSearchWorkspaceSize = 0; // default size_t maxSearchWorkspaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GiB // Allocate memory for index size_t ivf_pq_search_workspace_size; - CUANN_CHECK(ivf_pq::cuannIvfPqSearch_bufferSize(cuann_handle.get(), + CUANN_CHECK(ivf_pq::cuannIvfPqSearch_bufferSize(handle_, cuann_desc.get(), ivf_pq_index_buf_managed.data(), batchSize, @@ -303,7 +291,7 @@ class IvfPqTest : public ::testing::TestWithParam { rmm::device_buffer ivf_pq_search_ws_buf(ivf_pq_search_workspace_size, stream_); // finally, search! - CUANN_CHECK(cuannIvfPqSearch(cuann_handle.get(), + CUANN_CHECK(cuannIvfPqSearch(handle_, cuann_desc.get(), ivf_pq_index_buf_managed.data(), search_queries.data(), From f12e5da5966cc7c245c78865ba57a1e4591ddff6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 09:54:03 +0200 Subject: [PATCH 005/140] Reset the expected cuda error to cudaSuccess to not crumble following cuda calls --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index ad849b7ba3..562b8e5b27 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -1084,7 +1084,6 @@ inline void _cuann_kmeans_predict(const handle_t& handle, device_memory); stream.synchronize(); - if ((tempCenters != NULL) && (clusterSize != NULL)) { // accumulate _cuann_accumulate_with_label( @@ -6934,7 +6933,7 @@ inline void ivfpq_search(const handle_t& handle, desc->dimPq); constexpr size_t thresholdSmem = 48 * 1024; size_t sizeSmem = sizeof(smemLutDtype) * desc->dimPq * (1 << desc->bitPq); - size_t sizeSmemBaseDiff = sizeof(float) * desc->dimDataset; + size_t sizeSmemBaseDiff = sizeof(float) * desc->dimRotDataset; uint32_t numCTAs = numQueries * desc->numProbes; int numThreads = 1024; @@ -6963,6 +6962,9 @@ inline void ivfpq_search(const handle_t& handle, cudaError = cudaFuncSetAttribute( kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem); if (cudaError != cudaSuccess) { + RAFT_EXPECTS( + cudaError == cudaGetLastError(), + "Tried to reset the expected cuda error code, but it didn't match the expectation"); kernel_no_basediff_available = false; // Use "kernel_no_smem_lut" which just uses small amount of shared memory. 
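// Aside (hedged sketch, not part of the patch): the error-reset idiom this commit relies on. A
// failed cudaFuncSetAttribute() (e.g. when the requested dynamic shared memory exceeds the device
// limit) latches an error code in the per-thread CUDA error state; reading it once with
// cudaGetLastError() returns the state to cudaSuccess, so later unrelated CUDA calls do not
// report this stale failure. The RAFT_EXPECTS above merely asserts that the consumed error is
// the one just observed. The helper below is hypothetical (name and signature are not raft's).
#include <cuda_runtime.h>
inline bool try_set_max_dynamic_smem(const void* kernel_func, size_t smem_bytes)
{
  cudaError_t err = cudaFuncSetAttribute(
    kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(smem_bytes));
  if (err != cudaSuccess) {
    (void)cudaGetLastError();  // consume the latched error; the caller falls back to another kernel
    return false;
  }
  return true;
}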
@@ -6978,7 +6980,12 @@ inline void ivfpq_search(const handle_t& handle, if (sizeSmem + sizeSmemBaseDiff > thresholdSmem) { cudaError = cudaFuncSetAttribute( kernel_fast, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem + sizeSmemBaseDiff); - if (cudaError != cudaSuccess) { kernel_fast_available = false; } + if (cudaError != cudaSuccess) { + RAFT_EXPECTS( + cudaError == cudaGetLastError(), + "Tried to reset the expected cuda error code, but it didn't match the expectation"); + kernel_fast_available = false; + } } #if 0 fprintf( stderr, From 12fad922817a4adf7b29ece41eef1bf2235a0aed Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 12:52:47 +0200 Subject: [PATCH 006/140] Replace cuann return codes with raft exceptions --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 954 +++++++++--------------- cpp/test/spatial/ann_ivf_pq.cu | 50 +- 2 files changed, 367 insertions(+), 637 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 562b8e5b27..467cb59c87 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -119,21 +119,6 @@ extern __shared__ float smemArray[]; #define FP16_MAX 65504.0 -/* CUANN status type */ -typedef enum { - CUANN_STATUS_SUCCESS = 0, - CUANN_STATUS_ALLOC_FAILED = 1, - CUANN_STATUS_NOT_INITIALIZED = 2, - CUANN_STATUS_INVALID_VALUE = 3, - CUANN_STATUS_INTERNAL_ERROR = 4, - CUANN_STATUS_FILEIO_ERROR = 5, - CUANN_STATUS_CUDA_ERROR = 6, - CUANN_STATUS_CUBLAS_ERROR = 7, - CUANN_STATUS_INVALID_POINTER = 8, - CUANN_STATUS_VERSION_ERROR = 9, - CUANN_STATUS_UNSUPPORTED_DTYPE = 10, -} cuannStatus_t; - /* CUANN similarity type */ typedef enum { CUANN_SIMILARITY_INNER = 0, @@ -3239,10 +3224,10 @@ template __global__ void ivfpq_make_outputs( * */ -inline cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc); -inline cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc); +inline void cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc); +inline void cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc); -inline cuannStatus_t cuannIvfPqSetIndexParameters( +inline void cuannIvfPqSetIndexParameters( cuannIvfPqDescriptor_t desc, const uint32_t numClusters, /* Number of clusters */ const uint32_t numDataset, /* Number of dataset entries */ @@ -3252,19 +3237,19 @@ inline cuannStatus_t cuannIvfPqSetIndexParameters( const cuannSimilarity_t similarity, const cuannPqCenter_t typePqCenter); -inline cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - cuannSimilarity_t* similarity, - cuannPqCenter_t* typePqCenter); +inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + uint32_t* bitPq, + cuannSimilarity_t* similarity, + cuannPqCenter_t* typePqCenter); -inline cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, - size_t* size /* bytes of dataset index */); +inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, + size_t* size /* bytes of dataset index */); -inline cuannStatus_t cuannIvfPqBuildIndex( +inline void cuannIvfPqBuildIndex( const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* dataset, /* [numDataset, dimDataset] */ @@ -3276,75 +3261,74 @@ inline cuannStatus_t cuannIvfPqBuildIndex( bool hierarchicalClustering, /* If true, do kmeans training hierarchically */ void* 
index /* database index to build */); -inline cuannStatus_t cuannIvfPqSaveIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const char* fileName); +inline void cuannIvfPqSaveIndex(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const char* fileName); -inline cuannStatus_t cuannIvfPqLoadIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - void** index, - const char* fileName); +inline void cuannIvfPqLoadIndex(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + void** index, + const char* fileName); -inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( +inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( const handle_t& handle, const char* oldIndexFileName, const char* newIndexFileName, const void* newVectors, /* [numVectorsToAdd, dimDataset] */ uint32_t numNewVectors); -inline cuannStatus_t cuannIvfPqSetSearchParameters( +inline void cuannIvfPqSetSearchParameters( cuannIvfPqDescriptor_t desc, const uint32_t numProbes, /* Number of clusters to probe */ const uint32_t topK); /* Number of search results */ -inline cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t internalDistanceDtype, - cudaDataType_t smemLutDtype, - const uint32_t preferredThreadBlockSize); - -inline cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numProbes, - uint32_t* topK); - -inline cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t* internalDistanceDtype, - cudaDataType_t* smemLutDtype, - uint32_t* preferredThreadBlockSize); - -inline cuannStatus_t cuannIvfPqSearch_bufferSize(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - uint32_t numQueries, - size_t maxWorkspaceSize, - size_t* workspaceSize); - -inline cuannStatus_t cuannIvfPqSearch(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, +inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t internalDistanceDtype, + cudaDataType_t smemLutDtype, + const uint32_t preferredThreadBlockSize); + +inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numProbes, + uint32_t* topK); + +inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t* internalDistanceDtype, + cudaDataType_t* smemLutDtype, + uint32_t* preferredThreadBlockSize); + +inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const void* index, + uint32_t numQueries, + size_t maxWorkspaceSize, + size_t* workspaceSize); + +inline void cuannIvfPqSearch(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const void* queries, /* [numQueries, dimDataset] */ + cudaDataType_t dtype, + uint32_t numQueries, + uint64_t* neighbors, /* [numQueries, topK] */ + float* distances, /* [numQueries, topK] */ + void* workspace); + +inline void cuannPostprocessingRefine(uint32_t numDataset, + uint32_t numQueries, + uint32_t dimDataset, + const void* dataset, /* [numDataset, dimDataset] */ const void* queries, /* [numQueries, dimDataset] */ cudaDataType_t dtype, - uint32_t numQueries, - uint64_t* neighbors, /* [numQueries, topK] */ - float* distances, /* [numQueries, topK] */ - void* workspace); - -inline cuannStatus_t cuannPostprocessingRefine( - uint32_t numDataset, - uint32_t numQueries, - uint32_t dimDataset, - const void* dataset, /* [numDataset, dimDataset] */ - const void* 
queries, /* [numQueries, dimDataset] */ - cudaDataType_t dtype, - cuannSimilarity_t similarity, - uint32_t topK, - const uint64_t* neighbors, /* [numQueries, topK] */ - uint32_t refinedTopK, - uint64_t* refinedNeighbors, /* [numQueries, refinedTopK] */ - float* refinedDistances /* [numQueries, refinedTopK] */ + cuannSimilarity_t similarity, + uint32_t topK, + const uint64_t* neighbors, /* [numQueries, topK] */ + uint32_t refinedTopK, + uint64_t* refinedNeighbors, /* [numQueries, refinedTopK] */ + float* refinedDistances /* [numQueries, refinedTopK] */ ); -inline cuannStatus_t cuannPostprocessingMerge( +inline void cuannPostprocessingMerge( uint32_t numSplit, uint32_t numQueries, uint32_t topK, @@ -3826,33 +3810,28 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, // // Rotate the residual vectors using a rotation matrix // - cudaStream_t cublasStream = _cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); - float alpha = 1.0; - float beta = 0.0; - cublasStatus_t cublasError = cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - dimRotDataset, - clusterSize[l], - dimDataset, - &alpha, - rotationMatrix, - CUDA_R_32F, - dimDataset, - resVectors[devId], - CUDA_R_32F, - dimDataset, - &beta, - rotVectors[devId], - CUDA_R_32F, - dimRotDataset, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); - // return CUANN_STATUS_CUBLAS_ERROR; - exit(-1); - } + cudaStream_t cublasStream = _cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); + float alpha = 1.0; + float beta = 0.0; + RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + dimRotDataset, + clusterSize[l], + dimDataset, + &alpha, + rotationMatrix, + CUDA_R_32F, + dimDataset, + resVectors[devId], + CUDA_R_32F, + dimDataset, + &beta, + rotVectors[devId], + CUDA_R_32F, + dimRotDataset, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); _cuann_set_cublas_stream(handle.get_cublas_handle(), cublasStream); // @@ -3983,10 +3962,10 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, } // cuannIvfPqCreateDescriptor -inline cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) +inline void cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) { *desc = (cuannIvfPqDescriptor_t)malloc(sizeof(struct cuannIvfPqDescriptor)); - if (*desc == NULL) { return CUANN_STATUS_ALLOC_FAILED; } + RAFT_EXPECTS(*desc != nullptr, "cuann allocation failed"); (*desc)->numClusters = 0; (*desc)->numDataset = 0; (*desc)->dimDataset = 0; @@ -4001,92 +3980,46 @@ inline cuannStatus_t cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) (*desc)->maxSamples = 0; (*desc)->inclusiveSumSortedClusterSize = NULL; (*desc)->sqsumClusters = NULL; - return CUANN_STATUS_SUCCESS; } // cuannIvfPqDestroyDescriptor -inline cuannStatus_t cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) +inline void cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); if (desc->sqsumClusters != NULL) { cudaFree(desc->sqsumClusters); } free(desc); - return CUANN_STATUS_SUCCESS; } // cuannIvfPqSetIndexParameters -inline cuannStatus_t cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, - const uint32_t numClusters, - const uint32_t numDataset, - const uint32_t dimDataset, - const uint32_t dimPq, - const uint32_t bitPq, - const 
cuannSimilarity_t similarity, - const cuannPqCenter_t typePqCenter) +inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, + const uint32_t numClusters, + const uint32_t numDataset, + const uint32_t dimDataset, + const uint32_t dimPq, + const uint32_t bitPq, + const cuannSimilarity_t similarity, + const cuannPqCenter_t typePqCenter) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - if (numClusters == 0) { - fprintf( - stderr, "(%s) numClusters must be larger than zero (dimDataset:%u).\n", __func__, dimDataset); - return CUANN_STATUS_INVALID_VALUE; - } - if (numDataset == 0) { - fprintf( - stderr, "(%s) numDataset must be larger than zero (numDataset:%u).\n", __func__, numDataset); - return CUANN_STATUS_INVALID_VALUE; - } - if (dimDataset == 0) { - fprintf( - stderr, "(%s) dimDataset must be larger than zero (dimDataset:%u).\n", __func__, dimDataset); - return CUANN_STATUS_INVALID_VALUE; - } - if (dimPq == 0) { - fprintf(stderr, "(%s) dimPq must be larger than zero (dimPq:%u).\n", __func__, dimPq); - return CUANN_STATUS_INVALID_VALUE; - } - if (numClusters > numDataset) { - fprintf(stderr, - "(%s) numClusters must be smaller than numDataset (numClusters:%u, numDataset:%u).\n", - __func__, - numClusters, - numDataset); - return CUANN_STATUS_INVALID_VALUE; - } - if (bitPq < 4 || bitPq > 8) { - fprintf(stderr, "(%s) bitPq must be 4, 5, 6, 7 or 8 (bitPq:%u)\n", __func__, bitPq); - return CUANN_STATUS_INVALID_VALUE; - } - if (bitPq == 4 && dimPq % 2 != 0) { - fprintf(stderr, - "(%s) dimPq must be multiple of 2 when bitPq is 4 (dimPq:%u, bitPq:%u)\n", - __func__, - dimPq, - bitPq); - return CUANN_STATUS_INVALID_VALUE; - } - if (bitPq == 5 && dimPq % 8 != 0) { - fprintf(stderr, - "(%s) dimPq must be multiple of 8 when bitPq is 5 (dimPq:%u, bitPq:%u)\n", - __func__, - dimPq, - bitPq); - return CUANN_STATUS_INVALID_VALUE; - } - if (bitPq == 6 && dimPq % 4 != 0) { - fprintf(stderr, - "(%s) dimPq must be multiple of 4 when bitPq is 6 (dimPq:%u, bitPq:%u)\n", - __func__, - dimPq, - bitPq); - return CUANN_STATUS_INVALID_VALUE; - } - if (bitPq == 7 && dimPq % 8 != 0) { - fprintf(stderr, - "(%s) dimPq must be multiple of 8 when bitPq is 7 (dimPq:%u, bitPq:%u)\n", - __func__, - dimPq, - bitPq); - return CUANN_STATUS_INVALID_VALUE; - } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + RAFT_EXPECTS(numClusters > 0, "(%s) numClusters must be larger than zero.", __func__); + RAFT_EXPECTS(numDataset > 0, "(%s) numDataset must be larger than zero.", __func__); + RAFT_EXPECTS(dimDataset > 0, "(%s) dimDataset must be larger than zero.", __func__); + RAFT_EXPECTS(dimPq > 0, "(%s) dimPq must be larger than zero.", __func__); + RAFT_EXPECTS(numClusters <= numDataset, + "(%s) numClusters must be smaller than numDataset (numClusters:%u, numDataset:%u).", + __func__, + numClusters, + numDataset); + RAFT_EXPECTS(bitPq >= 4 && bitPq <= 8, + "(%s) bitPq must be within closed range [4,8], but got %u.", + __func__, + bitPq); + RAFT_EXPECTS((bitPq * dimPq) % 8 == 0, + "(%s) `bitPq * dimPq` must be a multiple of 8, but got %u * %u = %u.", + __func__, + bitPq, + dimPq, + bitPq * dimPq); desc->numClusters = numClusters; desc->numDataset = numDataset; desc->dimDataset = dimDataset; @@ -4102,20 +4035,19 @@ inline cuannStatus_t cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, desc->dimRotDataset = dimDataset; if (dimDataset % dimPq) { desc->dimRotDataset = ((dimDataset / dimPq) + 1) * dimPq; } desc->lenPq = desc->dimRotDataset / dimPq; - return CUANN_STATUS_SUCCESS; } 
// cuannIvfPqGetIndexParameters -inline cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - cuannSimilarity_t* similarity, - cuannPqCenter_t* typePqCenter) +inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + uint32_t* bitPq, + cuannSimilarity_t* similarity, + cuannPqCenter_t* typePqCenter) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); *numClusters = desc->numClusters; *numDataset = desc->numDataset; @@ -4124,13 +4056,12 @@ inline cuannStatus_t cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, *bitPq = desc->bitPq; *similarity = desc->similarity; *typePqCenter = desc->typePqCenter; - return CUANN_STATUS_SUCCESS; } // cuannIvfPqGetIndexSize -inline cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) +inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); *size = sizeof(struct cuannIvfPqIndexHeader); if (*size != 1024) { @@ -4144,47 +4075,38 @@ inline cuannStatus_t cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* *size += _cuann_getIndexSize_indexPtr(desc); *size += _cuann_getIndexSize_rotationMatrix(desc); *size += _cuann_getIndexSize_clusterRotCenters(desc); - return CUANN_STATUS_SUCCESS; } // cuannIvfPqBuildIndex -inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* dataset, - const void* trainset, - cudaDataType_t dtype, - uint32_t numTrainset, - uint32_t numIterations, - bool randomRotation, - bool hierarchicalClustering, - void* index) +inline void cuannIvfPqBuildIndex(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const void* dataset, + const void* trainset, + cudaDataType_t dtype, + uint32_t numTrainset, + uint32_t numIterations, + bool randomRotation, + bool hierarchicalClustering, + void* index) { int cuannDevId = handle.get_device(); int callerDevId = _cuann_set_device(cuannDevId); - if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { - return CUANN_STATUS_UNSUPPORTED_DTYPE; - } - if (desc->similarity == CUANN_SIMILARITY_INNER && dtype != CUDA_R_32F) { - fprintf( - stderr, "(%s, %d) CUANN_SIMILARITY_INNER supports float dtype only.\n", __func__, __LINE__); - return CUANN_STATUS_UNSUPPORTED_DTYPE; + RAFT_EXPECTS(dtype == CUDA_R_32F || dtype == CUDA_R_8U || dtype == CUDA_R_8I, + "Unsupported dtype"); + if (desc->similarity == CUANN_SIMILARITY_INNER) { + RAFT_EXPECTS(dtype == CUDA_R_32F, + "Unsupported dtype (inner-product metric support float only)"); } + desc->dtypeDataset = dtype; char dtypeString[64]; fprintf(stderr, "# dtypeDataset: %s\n", _cuann_get_dtype_string(desc->dtypeDataset, dtypeString)); - cudaError_t cudaError; - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dataset); - if (attr.type == cudaMemoryTypeDevice) { - fprintf(stderr, "(%s) dataset must be accessible from the host.\n", __func__); - return CUANN_STATUS_INVALID_POINTER; - } - cudaPointerGetAttributes(&attr, trainset); - if (attr.type == cudaMemoryTypeDevice) { - fprintf(stderr, "(%s) trainset must be accessible from the host.\n", __func__); - return CUANN_STATUS_INVALID_POINTER; + 
switch (detail::utils::check_pointer_residency(dataset, trainset)) { + case detail::utils::pointer_residency::host_only: + case detail::utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("both dataset and trainset must be accessible from the host."); } struct cuannIvfPqIndexHeader* header; @@ -4208,26 +4130,14 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, &clusterRotCenters); uint32_t* trainsetLabels; // [numTrainset] - cudaError = cudaMallocManaged(&trainsetLabels, sizeof(uint32_t) * numTrainset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&trainsetLabels, sizeof(uint32_t) * numTrainset)); uint32_t* clusterSize; // [numClusters] - cudaError = cudaMallocManaged(&clusterSize, sizeof(uint32_t) * desc->numClusters); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * desc->numClusters)); float* clusterCentersTemp; // [numClusters, dimDataset] - cudaError = - cudaMallocManaged(&clusterCentersTemp, sizeof(float) * desc->numClusters * desc->dimDataset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY( + cudaMallocManaged(&clusterCentersTemp, sizeof(float) * desc->numClusters * desc->dimDataset)); uint32_t** wsKAC = _cuann_multi_device_malloc(1, 1, "wsKAC"); @@ -4241,33 +4151,18 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, fprintf(stderr, "# numMesoClusters: %u\n", numMesoClusters); float* mesoClusterCenters; // [numMesoClusters, dimDataset] - cudaError = - cudaMallocManaged(&mesoClusterCenters, sizeof(float) * numMesoClusters * desc->dimDataset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY( + cudaMallocManaged(&mesoClusterCenters, sizeof(float) * numMesoClusters * desc->dimDataset)); + float* mesoClusterCentersTemp; // [numMesoClusters, dimDataset] - cudaError = cudaMallocManaged(&mesoClusterCentersTemp, - sizeof(float) * numMesoClusters * desc->dimDataset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterCentersTemp, + sizeof(float) * numMesoClusters * desc->dimDataset)); uint32_t* mesoClusterLabels; // [numTrainset,] - cudaError = cudaMallocManaged(&mesoClusterLabels, sizeof(uint32_t) * numTrainset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterLabels, sizeof(uint32_t) * numTrainset)); uint32_t* mesoClusterSize; // [numMesoClusters,] - cudaError = cudaMallocManaged(&mesoClusterSize, sizeof(uint32_t) * numMesoClusters); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterSize, sizeof(uint32_t) * numMesoClusters)); // // Training kmeans for meso-clusters @@ -4477,10 +4372,10 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const 
handle_t& handle, _cuann_multi_device_free(clusterSizeMP, 1); _cuann_multi_device_free((uint8_t**)predictWorkspace, 1); - cudaFree(mesoClusterSize); - cudaFree(mesoClusterLabels); - cudaFree(mesoClusterCenters); - cudaFree(mesoClusterCentersTemp); + RAFT_CUDA_TRY(cudaFree(mesoClusterSize)); + RAFT_CUDA_TRY(cudaFree(mesoClusterLabels)); + RAFT_CUDA_TRY(cudaFree(mesoClusterCenters)); + RAFT_CUDA_TRY(cudaFree(mesoClusterCentersTemp)); free(numFineClusters); free(csumFineClusters); @@ -4568,11 +4463,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, } uint32_t* datasetLabels; // [numDataset] - cudaError = cudaMallocManaged(&datasetLabels, sizeof(uint32_t) * desc->numDataset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&datasetLabels, sizeof(uint32_t) * desc->numDataset)); // // Predict labels of whole dataset (with multiple GPUs) @@ -4604,32 +4495,28 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, desc->dimRotDataset, desc->dimDataset, desc->lenPq, randomRotation, rotationMatrix); // Rotate clusterCenters - cudaStream_t cublasStream = _cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); - float alpha = 1.0; - float beta = 0.0; - cublasStatus_t cublasError = cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - desc->dimRotDataset, - desc->numClusters, - desc->dimDataset, - &alpha, - rotationMatrix, - CUDA_R_32F, - desc->dimDataset, - clusterCenters, - CUDA_R_32F, - desc->dimDataset, - &beta, - clusterRotCenters, - CUDA_R_32F, - desc->dimRotDataset, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUBLAS_ERROR; - } + cudaStream_t cublasStream = _cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); + float alpha = 1.0; + float beta = 0.0; + RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + desc->dimRotDataset, + desc->numClusters, + desc->dimDataset, + &alpha, + rotationMatrix, + CUDA_R_32F, + desc->dimDataset, + clusterCenters, + CUDA_R_32F, + desc->dimDataset, + &beta, + clusterRotCenters, + CUDA_R_32F, + desc->dimRotDataset, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); _cuann_set_cublas_stream(handle.get_cublas_handle(), cublasStream); // @@ -4642,10 +4529,7 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, indexPtr[l + 1] = indexPtr[l] + clusterSize[l]; if (maxClusterSize < clusterSize[l]) { maxClusterSize = clusterSize[l]; } } - if (indexPtr[desc->numClusters] != desc->numDataset) { - fprintf(stderr, "(%s, %d) Unexpected Error.\n", __func__, __LINE__); - return CUANN_STATUS_INTERNAL_ERROR; - } + RAFT_EXPECTS(indexPtr[desc->numClusters] == desc->numDataset, "Cluster sizes do not add up"); desc->maxClusterSize = maxClusterSize; // fprintf(stderr, "(%s) maxClusterSize: %u\n", __func__, maxClusterSize); @@ -4861,12 +4745,8 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, // combine clusterCenters and sqsumClusters cudaDeviceSynchronize(); float* tmpClusterCenters; // [numClusters, dimDataset] - cudaError = - cudaMallocManaged(&tmpClusterCenters, sizeof(float) * desc->numClusters * desc->dimDataset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return 
CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY( + cudaMallocManaged(&tmpClusterCenters, sizeof(float) * desc->numClusters * desc->dimDataset)); for (uint32_t i = 0; i < desc->numClusters * desc->dimDataset; i++) { tmpClusterCenters[i] = clusterCenters[i]; } @@ -4896,10 +4776,10 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, header->numDatasetAdded = 0; // - cudaFree(clusterSize); - cudaFree(trainsetLabels); - cudaFree(datasetLabels); - cudaFree(clusterCentersTemp); + RAFT_CUDA_TRY(cudaFree(clusterSize)); + RAFT_CUDA_TRY(cudaFree(trainsetLabels)); + RAFT_CUDA_TRY(cudaFree(datasetLabels)); + RAFT_CUDA_TRY(cudaFree(clusterCentersTemp)); _cuann_multi_device_free(wsKAC, 1); _cuann_multi_device_free(pqCentersTemp, 1); @@ -4907,63 +4787,50 @@ inline cuannStatus_t cuannIvfPqBuildIndex(const handle_t& handle, _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, 1); _cuann_set_device(callerDevId); - - return CUANN_STATUS_SUCCESS; } // cuannIvfPqSaveIndex -inline cuannStatus_t cuannIvfPqSaveIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const char* fileName) +inline void cuannIvfPqSaveIndex(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const char* fileName) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); int orgDevId = _cuann_set_device(handle.get_device()); FILE* fp = fopen(fileName, "w"); - if (fp == NULL) { - fprintf(stderr, "(%s) failed to open file (%s).\n", __func__, fileName); - return CUANN_STATUS_FILEIO_ERROR; - } + RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); + struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)index; fprintf(stderr, "(%s) indexSize: %lu\n", __func__, header->indexSize); if (fwrite(index, 1, header->indexSize, fp) != header->indexSize) { - fprintf(stderr, "(%s) failed to save index to file (%s)\n", __func__, fileName); - return CUANN_STATUS_FILEIO_ERROR; + RAFT_FAIL("(%s) failed to save index to file (%s)\n", __func__, fileName); } fclose(fp); _cuann_set_device(orgDevId); - return CUANN_STATUS_SUCCESS; } // cuannIvfPqLoadIndex -inline cuannStatus_t cuannIvfPqLoadIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - void** index, - const char* fileName) +inline void cuannIvfPqLoadIndex(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + void** index, + const char* fileName) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); int orgDevId = _cuann_set_device(handle.get_device()); if (1 /* *index == NULL */) { FILE* fp = fopen(fileName, "r"); - if (fp == NULL) { - fprintf(stderr, "(%s) failed to open file (%s)\n", __func__, fileName); - return CUANN_STATUS_FILEIO_ERROR; - } + RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); + size_t indexSize; fread(&indexSize, sizeof(size_t), 1, fp); fprintf(stderr, "(%s) indexSize: %lu\n", __func__, indexSize); - cudaError_t cudaError = cudaMallocManaged(index, indexSize); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s) cudaMallocManaged() failed.\n", __func__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(index, indexSize)); fseek(fp, 0, SEEK_SET); if (fread(*index, 1, indexSize, fp) != indexSize) { - fprintf(stderr, "(%s) failed to load index to from file (%s)\n", __func__, fileName); - return CUANN_STATUS_FILEIO_ERROR; + RAFT_FAIL("(%s) 
failed to load index from file (%s)\n", __func__, fileName); } fclose(fp); @@ -5042,24 +4909,20 @@ inline cuannStatus_t cuannIvfPqLoadIndex(const handle_t& handle, } _cuann_set_device(orgDevId); - return CUANN_STATUS_SUCCESS; } // cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex -inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( +inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( const handle_t& handle, const char* oldIndexFileName, const char* newIndexFileName, const void* newVectors, /* [numNewVectors, dimDataset] */ uint32_t numNewVectors) { - cudaError_t cudaError; - cuannStatus_t ret; - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, newVectors); - if (attr.type == cudaMemoryTypeDevice) { - fprintf(stderr, "(%s, %d) newVectors must be accessible from the host.\n", __func__, __LINE__); - return CUANN_STATUS_INVALID_POINTER; + switch (detail::utils::check_pointer_residency(newVectors)) { + case detail::utils::pointer_residency::host_only: + case detail::utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("newVectors must be accessible from the host."); } int cuannDevId = handle.get_device(); int callerDevId = _cuann_set_device(cuannDevId); @@ -5068,11 +4931,9 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // Load old index // cuannIvfPqDescriptor_t oldDesc; - ret = cuannIvfPqCreateDescriptor(&oldDesc); - if (ret != CUANN_STATUS_SUCCESS) { return ret; } + cuannIvfPqCreateDescriptor(&oldDesc); void* oldIndex; - ret = cuannIvfPqLoadIndex(handle, oldDesc, &oldIndex, oldIndexFileName); - if (ret != CUANN_STATUS_SUCCESS) { return ret; } + cuannIvfPqLoadIndex(handle, oldDesc, &oldIndex, oldIndexFileName); cudaDataType_t dtype = oldDesc->dtypeDataset; char dtypeString[64]; fprintf(stderr, "(%s) dtype: %s\n", __func__, _cuann_get_dtype_string(dtype, dtypeString)); @@ -5103,12 +4964,8 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // are extracted. // float* clusterCenters; // [numClusters, dimDataset] - cudaError = - cudaMallocManaged(&clusterCenters, sizeof(float) * oldDesc->numClusters * oldDesc->dimDataset); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY( + cudaMallocManaged(&clusterCenters, sizeof(float) * oldDesc->numClusters * oldDesc->dimDataset)); for (uint32_t i = 0; i < oldDesc->numClusters; i++) { memcpy(clusterCenters + (uint64_t)i * oldDesc->dimDataset, oldClusterCenters + (uint64_t)i * oldDesc->dimDatasetExt, @@ -5120,18 +4977,10 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // of the vector to be added. 
// uint32_t* newVectorLabels; // [numNewVectors,] - cudaError = cudaMallocManaged(&newVectorLabels, sizeof(uint32_t) * numNewVectors); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&newVectorLabels, sizeof(uint32_t) * numNewVectors)); cudaMemset(newVectorLabels, 0, sizeof(uint32_t) * numNewVectors); uint32_t* clusterSize; // [numClusters,] - cudaError = cudaMallocManaged(&clusterSize, sizeof(uint32_t) * oldDesc->numClusters); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * oldDesc->numClusters)); cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters); fprintf(stderr, "(%s) Predict label of new vectors\n", __func__); _cuann_kmeans_predict_MP(handle, @@ -5193,10 +5042,7 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( indexPtr[l + 1] = indexPtr[l] + clusterSize[l]; maxClusterSize = max(maxClusterSize, clusterSize[l]); } - if (indexPtr[oldDesc->numClusters] != numNewVectors) { - fprintf(stderr, "(%s, %d) Unexpected Error.\n", __func__, __LINE__); - return CUANN_STATUS_INTERNAL_ERROR; - } + RAFT_EXPECTS(indexPtr[oldDesc->numClusters] == numNewVectors, "cluster sizes do not add up."); // originalNumbers for (uint32_t i = 0; i < numNewVectors; i++) { uint32_t l = newVectorLabels[i]; @@ -5212,12 +5058,8 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // Compute PQ code for new vectors // uint8_t* pqDataset; // [numNewVectors, dimPq * bitPq / 8] - cudaError = cudaMallocManaged( - &pqDataset, sizeof(uint8_t) * numNewVectors * oldDesc->dimPq * oldDesc->bitPq / 8); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - return CUANN_STATUS_ALLOC_FAILED; - } + RAFT_CUDA_TRY(cudaMallocManaged( + &pqDataset, sizeof(uint8_t) * numNewVectors * oldDesc->dimPq * oldDesc->bitPq / 8)); _cuann_compute_PQ_code(handle, numNewVectors, oldDesc->dimDataset, @@ -5244,8 +5086,7 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // Create descriptor for new index // cuannIvfPqDescriptor_t newDesc; - ret = cuannIvfPqCreateDescriptor(&newDesc); - if (ret != CUANN_STATUS_SUCCESS) { return ret; } + cuannIvfPqCreateDescriptor(&newDesc); memcpy(newDesc, oldDesc, sizeof(struct cuannIvfPqDescriptor)); newDesc->numDataset += numNewVectors; fprintf( @@ -5255,8 +5096,7 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // Allocate memory for new index // size_t newIndexSize; - ret = cuannIvfPqGetIndexSize(newDesc, &newIndexSize); - if (ret != CUANN_STATUS_SUCCESS) { return ret; } + cuannIvfPqGetIndexSize(newDesc, &newIndexSize); fprintf(stderr, "(%s) indexSize: %lu -> %lu\n", __func__, oldHeader->indexSize, newIndexSize); void* newIndex = malloc(newIndexSize); memset(newIndex, 0, newIndexSize); @@ -5350,8 +5190,7 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // // Save new index // - ret = cuannIvfPqSaveIndex(handle, newDesc, newIndex, newIndexFileName); - if (ret != CUANN_STATUS_SUCCESS) { return ret; } + cuannIvfPqSaveIndex(handle, newDesc, newIndex, newIndexFileName); if (newHeader->numDatasetAdded * 2 >= newHeader->numDataset) { fprintf(stderr, "(%s) The total number of vectors in the new index" @@ -5373,49 
+5212,32 @@ inline cuannStatus_t cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( free(indexPtr); free(newIndex); - cudaFree(pqDataset); - cudaFree(clusterSize); - cudaFree(newVectorLabels); - cudaFree(clusterCenters); - cudaFree(oldIndex); + RAFT_CUDA_TRY(cudaFree(pqDataset)); + RAFT_CUDA_TRY(cudaFree(clusterSize)); + RAFT_CUDA_TRY(cudaFree(newVectorLabels)); + RAFT_CUDA_TRY(cudaFree(clusterCenters)); + RAFT_CUDA_TRY(cudaFree(oldIndex)); _cuann_set_device(callerDevId); - - return CUANN_STATUS_SUCCESS; } // cuannIvfPqSetSearchParameters -inline cuannStatus_t cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, - const uint32_t numProbes, - const uint32_t topK) +inline void cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, + const uint32_t numProbes, + const uint32_t topK) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - if (numProbes == 0) { - fprintf( - stderr, "(%s) numProbes must be larger than zero (numProbes:%u).\n", __func__, numProbes); - return CUANN_STATUS_INVALID_VALUE; - } - if (topK == 0) { - fprintf(stderr, "(%s) topK must be larger than zero (topK:%u).\n", __func__, topK); - return CUANN_STATUS_INVALID_VALUE; - } - if (numProbes > desc->numClusters) { - fprintf(stderr, - "(%s) numProbes must be smaller than or equal to numClusters (numProbes:%u, " - "numClusters:%u).\n", - __func__, - numProbes, - desc->numClusters); - return CUANN_STATUS_INVALID_VALUE; - } - if (topK > desc->numDataset) { - fprintf(stderr, - "(%s) topK must be smaller than or equal to numDataset (topK:%u, numDataset:%u).\n", - __func__, - topK, - desc->numDataset); - return CUANN_STATUS_INVALID_VALUE; - } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + RAFT_EXPECTS(numProbes > 0, "numProbes must be larger than zero"); + RAFT_EXPECTS(topK > 0, "topK must be larger than zero"); + RAFT_EXPECTS(numProbes <= desc->numClusters, + "numProbes (%u) must not be larger than numClusters (%u)", + numProbes, + desc->numClusters); + RAFT_EXPECTS(topK <= desc->numDataset, + "topK (%u) must not be larger than numDataset (%u)", + topK, + desc->numDataset); + uint32_t numSamplesWorstCase = desc->numDataset; if (numProbes < desc->numClusters) { numSamplesWorstCase = @@ -5424,16 +5246,12 @@ inline cuannStatus_t cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, desc->_numClustersSize0]; // (*) urgent WA, need to be // fixed. 
} - if (topK > numSamplesWorstCase) { - fprintf(stderr, - "(%s) numProbes is too small to get topK results reliably (numProbes:%u, topK:%u, " - "numSamplesWorstCase:%u).\n", - __func__, - numProbes, - topK, - numSamplesWorstCase); - return CUANN_STATUS_INVALID_VALUE; - } + RAFT_EXPECTS(topK <= numSamplesWorstCase, + "numProbes is too small to get topK results reliably (numProbes: %u, topK: %u, " + "numSamplesWorstCase: %u).", + numProbes, + topK, + numSamplesWorstCase); desc->numProbes = numProbes; desc->topK = topK; if (0) { @@ -5447,33 +5265,24 @@ inline cuannStatus_t cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, desc->smemLutDtype = CUDA_R_32F; desc->preferredThreadBlockSize = 0; // fprintf(stderr, "# maxSample: %u\n", desc->inclusiveSumSortedClusterSize[0]); - return CUANN_STATUS_SUCCESS; } // cuannIvfPqSetSearchParameters -inline cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t internalDistanceDtype, - cudaDataType_t smemLutDtype, - const uint32_t preferredThreadBlockSize) +inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t internalDistanceDtype, + cudaDataType_t smemLutDtype, + const uint32_t preferredThreadBlockSize) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } - if (internalDistanceDtype != CUDA_R_16F && internalDistanceDtype != CUDA_R_32F) { - fprintf( - stderr, "(%s) internalDistanceDtype must be either CUDA_R_16F or CUDA_R_32F\n", __func__); - return CUANN_STATUS_UNSUPPORTED_DTYPE; - } - if (smemLutDtype != CUDA_R_16F && smemLutDtype != CUDA_R_32F && smemLutDtype != CUDA_R_8U) { - fprintf(stderr, "(%s) smemLutDtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U\n", __func__); - return CUANN_STATUS_UNSUPPORTED_DTYPE; - } - if (preferredThreadBlockSize != 256 && preferredThreadBlockSize != 512 && - preferredThreadBlockSize != 1024 && preferredThreadBlockSize != 0) { - fprintf(stderr, - "(%s) preferredThreadBlockSize must be 0, 256, 512 or 1024. 
%u is given.\n", - __func__, - preferredThreadBlockSize); - return CUANN_STATUS_UNSUPPORTED_DTYPE; - } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + RAFT_EXPECTS(internalDistanceDtype == CUDA_R_16F || internalDistanceDtype == CUDA_R_32F, + "internalDistanceDtype must be either CUDA_R_16F or CUDA_R_32F"); + RAFT_EXPECTS( + smemLutDtype == CUDA_R_16F || smemLutDtype == CUDA_R_32F || smemLutDtype == CUDA_R_8U, + "smemLutDtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U"); + RAFT_EXPECTS(preferredThreadBlockSize == 256 || preferredThreadBlockSize == 512 || + preferredThreadBlockSize == 1024 || preferredThreadBlockSize == 0, + "preferredThreadBlockSize must be 0, 256, 512 or 1024, but %u is given.", + preferredThreadBlockSize); desc->internalDistanceDtype = internalDistanceDtype; desc->smemLutDtype = smemLutDtype; if (0) { @@ -5484,42 +5293,39 @@ inline cuannStatus_t cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t } desc->preferredThreadBlockSize = preferredThreadBlockSize; // fprintf(stderr, "# maxSample: %u\n", desc->inclusiveSumSortedClusterSize[0]); - return CUANN_STATUS_SUCCESS; } // cuannIvfPqGetSearchParameters -inline cuannStatus_t cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numProbes, - uint32_t* topK) +inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, + uint32_t* numProbes, + uint32_t* topK) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); *numProbes = desc->numProbes; *topK = desc->topK; - return CUANN_STATUS_SUCCESS; } // cuannIvfPqGetSearchTuningParameters -inline cuannStatus_t cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t* internalDistanceDtype, - cudaDataType_t* smemLutDtype, - uint32_t* preferredThreadBlockSize) +inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, + cudaDataType_t* internalDistanceDtype, + cudaDataType_t* smemLutDtype, + uint32_t* preferredThreadBlockSize) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); *internalDistanceDtype = desc->internalDistanceDtype; *smemLutDtype = desc->smemLutDtype; *preferredThreadBlockSize = desc->preferredThreadBlockSize; - return CUANN_STATUS_SUCCESS; } // cuannIvfPqSearch -inline cuannStatus_t cuannIvfPqSearch_bufferSize(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - uint32_t maxQueries, - size_t maxWorkspaceSize, - size_t* workspaceSize) +inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const void* index, + uint32_t maxQueries, + size_t maxWorkspaceSize, + size_t* workspaceSize) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); size_t max_ws = maxWorkspaceSize; if (max_ws == 0) { @@ -5596,12 +5402,10 @@ inline cuannStatus_t cuannIvfPqSearch_bufferSize(const handle_t& handle, *workspaceSize, (float)*workspaceSize / 1024 / 1024 / 1024); #endif - - return CUANN_STATUS_SUCCESS; } // cuannIvfPqSearch -inline cuannStatus_t cuannIvfPqSearch( +inline void cuannIvfPqSearch( const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, @@ -5612,12 +5416,11 @@ inline cuannStatus_t cuannIvfPqSearch( float* distances, // [numQueries, topK], device pointer void* workspace) { - if (desc == NULL) { return CUANN_STATUS_NOT_INITIALIZED; } + RAFT_EXPECTS(desc != 
nullptr, "the descriptor is not initialized."); int orgDevId = _cuann_set_device(handle.get_device()); - if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { - return CUANN_STATUS_UNSUPPORTED_DTYPE; - } + RAFT_EXPECTS(dtype == CUDA_R_32F || dtype == CUDA_R_8U || dtype == CUDA_R_8I, + "unsupported dtype"); struct cuannIvfPqIndexHeader* header; float* clusterCenters; // [numClusters, dimDatasetExt] @@ -5694,23 +5497,14 @@ inline cuannStatus_t cuannIvfPqSearch( } } - cublasStatus_t cublasError; - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, neighbors); - if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - fprintf(stderr, "(%s) neighbors must be accessible from the device.\n", __func__); - return CUANN_STATUS_INVALID_POINTER; - } - cudaPointerGetAttributes(&attr, distances); - if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - fprintf(stderr, "(%s) distances must be accessible from the device.\n", __func__); - return CUANN_STATUS_INVALID_POINTER; + switch (detail::utils::check_pointer_residency(neighbors, distances)) { + case detail::utils::pointer_residency::device_only: + case detail::utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("output pointers must be accessible from the device."); } - cudaPointerGetAttributes(&attr, queries); -#ifdef CUANN_DEBUG - cudaError_t cudaError; -#endif + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, queries); for (uint32_t i = 0; i < numQueries; i += desc->maxQueries) { uint32_t nQueries = min(desc->maxQueries, numQueries - i); @@ -5794,56 +5588,48 @@ inline cuannStatus_t cuannIvfPqSearch( gemmK = desc->dimDataset + 1; assert(gemmK <= desc->dimDatasetExt); } - cublasError = cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - desc->numClusters, - nQueries, - gemmK, - &alpha, - clusterCenters, - CUDA_R_32F, - desc->dimDatasetExt, - curQueries, - CUDA_R_32F, - desc->dimDatasetExt, - &beta, - QCDistances, - CUDA_R_32F, - desc->numClusters, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUBLAS_ERROR; - } + RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + desc->numClusters, + nQueries, + gemmK, + &alpha, + clusterCenters, + CUDA_R_32F, + desc->dimDatasetExt, + curQueries, + CUDA_R_32F, + desc->dimDatasetExt, + &beta, + QCDistances, + CUDA_R_32F, + desc->numClusters, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Rotate queries - alpha = 1.0; - beta = 0.0; - cublasError = cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - desc->dimRotDataset, - nQueries, - desc->dimDataset, - &alpha, - rotationMatrix, - CUDA_R_32F, - desc->dimDataset, - curQueries, - CUDA_R_32F, - desc->dimDatasetExt, - &beta, - rotQueries, - CUDA_R_32F, - desc->dimRotDataset, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasGemmEx() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUBLAS_ERROR; - } + alpha = 1.0; + beta = 0.0; + RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + desc->dimRotDataset, + nQueries, + desc->dimDataset, + &alpha, + rotationMatrix, + CUDA_R_32F, + desc->dimDataset, + curQueries, + CUDA_R_32F, + desc->dimDatasetExt, + &beta, + rotQueries, + CUDA_R_32F, + 
desc->dimRotDataset, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Select neighbor clusters for each query. _cuann_find_topk(handle, @@ -5855,14 +5641,7 @@ inline cuannStatus_t cuannIvfPqSearch( clusterLabelsToProbe, topkWorkspace, false); -#ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUDA_ERROR; - } -#endif - // + for (uint32_t j = 0; j < nQueries; j += desc->maxBatchSize) { uint32_t batchSize = min(desc->maxBatchSize, nQueries - j); _ivfpq_search(handle, @@ -5878,28 +5657,10 @@ inline cuannStatus_t cuannIvfPqSearch( neighbors + ((uint64_t)(desc->topK) * (i + j)), distances + ((uint64_t)(desc->topK) * (i + j)), searchWorkspace); -#ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf( - stderr, "(%s, %d) cudaDeviceSynchronize() failed (%d)\n", __func__, __LINE__, cudaError); - fprintf(stderr, "# i:%u, nQueries:%u, j:%u, batchSize:%u\n", i, nQueries, j, batchSize); - return CUANN_STATUS_CUDA_ERROR; - } -#endif } } -#ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - return CUANN_STATUS_CUDA_ERROR; - } -#endif - _cuann_set_device(orgDevId); - return CUANN_STATUS_SUCCESS; } // @@ -6792,11 +6553,7 @@ inline void ivfpq_search(const handle_t& handle, ivfpq_init_topkScores<<>>( topkScores, FLT_MAX, numQueries * desc->topK); #ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - exit(-1); - } + handle.sync_stream(); #endif } @@ -6806,11 +6563,7 @@ inline void ivfpq_search(const handle_t& handle, ivfpq_make_chunk_index_ptr<<>>( desc->numProbes, numQueries, indexPtr, clusterLabelsToProbe, chunkIndexPtr, numSamples); #ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - exit(-1); - } + handle.sync_stream(); #endif if (numQueries * desc->numProbes > 256) { @@ -6823,11 +6576,7 @@ inline void ivfpq_search(const handle_t& handle, ivfpq_prep_sort<<>>(numQueries * desc->numProbes, indexList); #ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - exit(-1); - } + handle.sync_stream(); #endif int begin_bit = 0; @@ -6843,16 +6592,7 @@ inline void ivfpq_search(const handle_t& handle, end_bit, handle.get_stream()); #ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - exit(-1); - } - if (0) { - for (uint32_t i = 0; i < numQueries * desc->numProbes; i++) { - fprintf(stderr, "# i:%u, index:%d, label:%u\n", i, indexListSorted[i], clusterLabelsOut[i]); - } - } + handle.sync_stream(); #endif } else { indexListSorted = NULL; @@ -7042,11 +6782,7 @@ inline void ivfpq_search(const handle_t& handle, (scoreDtype*)similarity, simTopkIndex); #ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - exit(-1); - } + handle.sync_stream(); #endif // Select topk vectors for each query 
@@ -7095,11 +6831,7 @@ inline void ivfpq_search(const handle_t& handle, topkNeighbors, topkDistances); #ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - exit(-1); - } + handle.sync_stream(); #endif } diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu index 145a7237bd..fa702a5bd5 100644 --- a/cpp/test/spatial/ann_ivf_pq.cu +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -108,8 +108,6 @@ auto eval_knn(const std::vector& expected_idx, return testing::AssertionSuccess(); } -#define CUANN_CHECK(ret) RAFT_EXPECTS(ret == ivf_pq::CUANN_STATUS_SUCCESS, "cuann failure: %d", ret) - template class IvfPqTest : public ::testing::TestWithParam { public: @@ -164,7 +162,7 @@ class IvfPqTest : public ::testing::TestWithParam { cuann_desc{ []() { ivf_pq::cuannIvfPqDescriptor_t d; - CUANN_CHECK(ivf_pq::cuannIvfPqCreateDescriptor(&d)); + ivf_pq::cuannIvfPqCreateDescriptor(&d); return d; }(), [](ivf_pq::cuannIvfPqDescriptor_t d) { ivf_pq::cuannIvfPqDestroyDescriptor(d); }}; @@ -209,7 +207,7 @@ class IvfPqTest : public ::testing::TestWithParam { : ivf_pq::CUANN_SIMILARITY_L2; // Specify whether PQ codebooks are created per subspace or per cluster. ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_SUBSPACE; - CUANN_CHECK(ivf_pq::cuannIvfPqSetIndexParameters( + ivf_pq::cuannIvfPqSetIndexParameters( cuann_desc.get(), n_clusters, /* Number of clusters */ uint32_t(ps.num_db_vecs), /* Number of dataset entries */ @@ -217,11 +215,11 @@ class IvfPqTest : public ::testing::TestWithParam { dimPq, /* Dimension of each entry after product quantization */ bitPq, /* Bit length of PQ */ similarity, - typePqCenter)); + typePqCenter); // Allocate memory for index size_t ivf_pq_index_size; - CUANN_CHECK(ivf_pq::cuannIvfPqGetIndexSize(cuann_desc.get(), &ivf_pq_index_size)); + ivf_pq::cuannIvfPqGetIndexSize(cuann_desc.get(), &ivf_pq_index_size); rmm::device_buffer ivf_pq_index_buf_managed(ivf_pq_index_size, stream_, &managed_memory); // Build index @@ -233,7 +231,7 @@ class IvfPqTest : public ::testing::TestWithParam { } else if constexpr (std::is_same_v) { dtype = CUDA_R_32F; } - CUANN_CHECK(ivf_pq::cuannIvfPqBuildIndex( + ivf_pq::cuannIvfPqBuildIndex( handle_, cuann_desc.get(), database.data(), // dataset @@ -244,11 +242,11 @@ class IvfPqTest : public ::testing::TestWithParam { randomRotation, true, // hierarchialClustering: always true in raft ivf_pq_index_buf_managed.data() // memory allocated for the index - )); + ); handle_.sync_stream(stream_); // set search parameters - CUANN_CHECK(ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc.get(), ps.nprobe, ps.k)); + ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc.get(), ps.nprobe, ps.k); // Data type of LUT to be created dynamically at search time. // // The use of low-precision types reduces the amount of shared memory @@ -272,8 +270,8 @@ class IvfPqTest : public ::testing::TestWithParam { // If 0, the thread block size is determined automatically. // uint32_t preferredThreadBlockSize = 0; // 0, 256, 512, or 1024 - CUANN_CHECK(ivf_pq::cuannIvfPqSetSearchTuningParameters( - cuann_desc.get(), internalDistanceDtype, smemLutDtype, preferredThreadBlockSize)); + ivf_pq::cuannIvfPqSetSearchTuningParameters( + cuann_desc.get(), internalDistanceDtype, smemLutDtype, preferredThreadBlockSize); // Maximum number of query vectors to search at the same time. 
uint32_t batchSize = std::min(ps.num_queries, 32768); // Maximum device memory size that may be used as workspace at search time. @@ -282,24 +280,24 @@ class IvfPqTest : public ::testing::TestWithParam { // Allocate memory for index size_t ivf_pq_search_workspace_size; - CUANN_CHECK(ivf_pq::cuannIvfPqSearch_bufferSize(handle_, - cuann_desc.get(), - ivf_pq_index_buf_managed.data(), - batchSize, - maxSearchWorkspaceSize, - &ivf_pq_search_workspace_size)); + ivf_pq::cuannIvfPqSearch_bufferSize(handle_, + cuann_desc.get(), + ivf_pq_index_buf_managed.data(), + batchSize, + maxSearchWorkspaceSize, + &ivf_pq_search_workspace_size); rmm::device_buffer ivf_pq_search_ws_buf(ivf_pq_search_workspace_size, stream_); // finally, search! - CUANN_CHECK(cuannIvfPqSearch(handle_, - cuann_desc.get(), - ivf_pq_index_buf_managed.data(), - search_queries.data(), - dtype, - ps.num_queries, - indices_ivf_pq_dev.data(), - distances_ivf_pq_dev.data(), - ivf_pq_search_ws_buf.data())); + cuannIvfPqSearch(handle_, + cuann_desc.get(), + ivf_pq_index_buf_managed.data(), + search_queries.data(), + dtype, + ps.num_queries, + indices_ivf_pq_dev.data(), + distances_ivf_pq_dev.data(), + ivf_pq_search_ws_buf.data()); handle_.sync_stream(stream_); update_host(distances_ivf_pq.data(), distances_ivf_pq_dev.data(), queries_size, stream_); From 0bb702f2723903287aad2af480f874e4b210ab57 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 13:14:40 +0200 Subject: [PATCH 007/140] replace cublas calls with raft wrappers --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 171 +++++++++--------------- 1 file changed, 66 insertions(+), 105 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 467cb59c87..028ae41a13 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -19,10 +19,10 @@ #include "detail/ann_utils.cuh" #include +#include #include #include - -#include +#include #include #include @@ -30,7 +30,6 @@ /////////////////// #include #include -#include #include #include @@ -865,24 +864,6 @@ inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCen } } -// -static cudaStream_t _cuann_set_cublas_stream(cublasHandle_t cublasHandle, cudaStream_t stream) -{ - cublasStatus_t cublasError; - cudaStream_t cublasStream; - cublasError = cublasGetStream(cublasHandle, &cublasStream); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasGetStream() failed.\n", __func__, __LINE__); - exit(-1); - } - cublasError = cublasSetStream(cublasHandle, stream); - if (cublasError != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "(%s, %d) cublasSetStream() failed.\n", __func__, __LINE__); - exit(-1); - } - return cublasStream; -} - // uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset) { @@ -3810,29 +3791,23 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, // // Rotate the residual vectors using a rotation matrix // - cudaStream_t cublasStream = _cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); - float alpha = 1.0; - float beta = 0.0; - RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - dimRotDataset, - clusterSize[l], - dimDataset, - &alpha, - rotationMatrix, - CUDA_R_32F, - dimDataset, - resVectors[devId], - CUDA_R_32F, - dimDataset, - &beta, - rotVectors[devId], - CUDA_R_32F, - dimRotDataset, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - _cuann_set_cublas_stream(handle.get_cublas_handle(), 
cublasStream); + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(handle, + true, + false, + dimRotDataset, + clusterSize[l], + dimDataset, + &alpha, + rotationMatrix, + dimDataset, + resVectors[devId], + dimDataset, + &beta, + rotVectors[devId], + dimRotDataset, + handle.get_stream()); // // Training PQ codebook if CUANN_PQ_CENTER_PER_CLUSTER @@ -4495,29 +4470,23 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, desc->dimRotDataset, desc->dimDataset, desc->lenPq, randomRotation, rotationMatrix); // Rotate clusterCenters - cudaStream_t cublasStream = _cuann_set_cublas_stream(handle.get_cublas_handle(), NULL); - float alpha = 1.0; - float beta = 0.0; - RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - desc->dimRotDataset, - desc->numClusters, - desc->dimDataset, - &alpha, - rotationMatrix, - CUDA_R_32F, - desc->dimDataset, - clusterCenters, - CUDA_R_32F, - desc->dimDataset, - &beta, - clusterRotCenters, - CUDA_R_32F, - desc->dimRotDataset, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - _cuann_set_cublas_stream(handle.get_cublas_handle(), cublasStream); + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(handle, + true, + false, + desc->dimRotDataset, + desc->numClusters, + desc->dimDataset, + &alpha, + rotationMatrix, + desc->dimDataset, + clusterCenters, + desc->dimDataset, + &beta, + clusterRotCenters, + desc->dimRotDataset, + handle.get_stream()); // // Make indexPtr, originalNumbers and pqDataset @@ -5588,48 +5557,40 @@ inline void cuannIvfPqSearch( gemmK = desc->dimDataset + 1; assert(gemmK <= desc->dimDatasetExt); } - RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - desc->numClusters, - nQueries, - gemmK, - &alpha, - clusterCenters, - CUDA_R_32F, - desc->dimDatasetExt, - curQueries, - CUDA_R_32F, - desc->dimDatasetExt, - &beta, - QCDistances, - CUDA_R_32F, - desc->numClusters, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + linalg::gemm(handle, + true, + false, + desc->numClusters, + nQueries, + gemmK, + &alpha, + clusterCenters, + desc->dimDatasetExt, + curQueries, + desc->dimDatasetExt, + &beta, + QCDistances, + desc->numClusters, + handle.get_stream()); // Rotate queries alpha = 1.0; beta = 0.0; - RAFT_CUBLAS_TRY(cublasGemmEx(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - desc->dimRotDataset, - nQueries, - desc->dimDataset, - &alpha, - rotationMatrix, - CUDA_R_32F, - desc->dimDataset, - curQueries, - CUDA_R_32F, - desc->dimDatasetExt, - &beta, - rotQueries, - CUDA_R_32F, - desc->dimRotDataset, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + linalg::gemm(handle, + true, + false, + desc->dimRotDataset, + nQueries, + desc->dimDataset, + &alpha, + rotationMatrix, + desc->dimDataset, + curQueries, + desc->dimDatasetExt, + &beta, + rotQueries, + desc->dimRotDataset, + handle.get_stream()); // Select neighbor clusters for each query. 
_cuann_find_topk(handle, From e1cae88a025a30f200ca41bd482d58a37ec2af8d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 13:32:24 +0200 Subject: [PATCH 008/140] Replace _cuann_memset with utils::memzero --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 31 +++++++------------------ 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 028ae41a13..f13456fb40 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -204,22 +204,6 @@ inline size_t _cuann_aligned(size_t size) return size; } -// memset -inline void _cuann_memset(void* ptr, int value, size_t count) -{ - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, ptr); - if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { - cudaError_t ret = cudaMemset(ptr, value, count); - if (ret != cudaSuccess) { - fprintf(stderr, "(%s) cudaMemset() failed\n", __func__); - exit(-1); - } - } else { - memset(ptr, value, count); - } -} - // argmin along column __global__ void kern_argmin(uint32_t nRows, uint32_t nCols, @@ -822,10 +806,11 @@ inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCen uint32_t* clusterSize, // [numCenters] float* accumulatedCenters) { + auto stream = rmm::cuda_stream_default; if (accumulatedCenters == NULL) { // accumulate - _cuann_memset(centers, 0, sizeof(float) * numCenters * dimCenters); - _cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters); + detail::utils::memzero(centers, numCenters * dimCenters, stream); + detail::utils::memzero(clusterSize, numCenters, stream); if (dtype == CUDA_R_32F) { _cuann_accumulate_with_label( numCenters, dimCenters, centers, clusterSize, numDataset, (const float*)dataset, labels); @@ -953,9 +938,10 @@ inline void _cuann_kmeans_predict(const handle_t& handle, // workspace_core = // (float*)((uint8_t*)bufDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); + auto stream = handle.get_stream(); if (tempCenters != NULL && clusterSize != NULL) { - _cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters); - _cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters); + detail::utils::memzero(tempCenters, numCenters * dimCenters, stream); + detail::utils::memzero(clusterSize, numCenters, stream); } cudaMemcpyKind kind; @@ -973,7 +959,6 @@ inline void _cuann_kmeans_predict(const handle_t& handle, RAFT_LOG_DEBUG("_cuann_kmeans_predict: using pool memory resource with initial size %zu bytes", pool_guard->pool_size()); } - auto stream = handle.get_stream(); auto metric = similarity == CUANN_SIMILARITY_INNER ? 
raft::distance::DistanceType::InnerProduct : raft::distance::DistanceType::L2Expanded; @@ -1160,9 +1145,11 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, cudaDeviceSynchronize(); } cudaSetDevice(orgDevId); + auto stream = handle.get_stream(); if (clusterSize != NULL) { // Reduce results to main thread - _cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters); + detail::utils::memzero(clusterSize, numCenters, stream); + handle.sync_stream(stream); for (int devId = 0; devId < numDevices; devId++) { _cuann_axpy(numCenters, 1, clusterSizeMP[devId], clusterSize); if (devId != orgDevId) { From 5b2c79913e4be85a57e2809c30db6b8f29718d18 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 14:16:56 +0200 Subject: [PATCH 009/140] Use raft logging in most of the places and wrap all cuda calls into RAFT_CUDA_TRY --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 410 ++++-------------------- 1 file changed, 70 insertions(+), 340 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index f13456fb40..d183b8140d 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -204,59 +204,6 @@ inline size_t _cuann_aligned(size_t size) return size; } -// argmin along column -__global__ void kern_argmin(uint32_t nRows, - uint32_t nCols, - const float* a, // [nRows, nCols] - uint32_t* out // [nRows] -) -{ - __shared__ uint32_t smCol[1024]; - __shared__ float smVal[1024]; - uint32_t iRow = blockIdx.x; - if (iRow >= nRows) return; - uint32_t iCol = threadIdx.x; - uint32_t minCol = nCols; - float minVal = FLT_MAX; - for (iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { - if (minVal > a[iCol + (nCols * iRow)]) { - minVal = a[iCol + (nCols * iRow)]; - minCol = iCol; - } - } - smVal[threadIdx.x] = minVal; - smCol[threadIdx.x] = minCol; - __syncthreads(); - for (uint32_t offset = blockDim.x / 2; offset > 0; offset >>= 1) { - if (threadIdx.x < offset) { - if (smVal[threadIdx.x] < smVal[threadIdx.x + offset]) { - } else if (smVal[threadIdx.x] > smVal[threadIdx.x + offset]) { - smVal[threadIdx.x] = smVal[threadIdx.x + offset]; - smCol[threadIdx.x] = smCol[threadIdx.x + offset]; - } else if (smCol[threadIdx.x] > smCol[threadIdx.x + offset]) { - smCol[threadIdx.x] = smCol[threadIdx.x + offset]; - } - } - __syncthreads(); - } - if (threadIdx.x == 0) { out[iRow] = smCol[0]; } -} - -// argmin along column -inline void _cuann_argmin(uint32_t nRows, - uint32_t nCols, - const float* a, // [nRows, nCols] - uint32_t* out // [nRows] -) -{ - uint32_t nThreads = 1024; - while (nThreads > nCols) { - nThreads /= 2; - } - nThreads = max(nThreads, 128); - kern_argmin<<>>(nRows, nCols, a, out); -} - // copy template __global__ void kern_copy(uint32_t nRows, @@ -713,50 +660,18 @@ T** _cuann_multi_device_malloc(int numDevices, // otherwise, cudaMallocManaged() used. 
) { - cudaError_t cudaError; int orgDevId; - cudaError = cudaGetDevice(&orgDevId); - if (cudaError != cudaSuccess) { - fprintf( - stderr, "(%s, %d) cudaGetDevice() failed (arrayName: %s).\n", __func__, __LINE__, arrayName); - exit(-1); - } + RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); T** arrays = (T**)malloc(sizeof(T*) * numDevices); for (int devId = 0; devId < numDevices; devId++) { - cudaError = cudaSetDevice(devId); - if (cudaError != cudaSuccess) { - fprintf(stderr, - "(%s, %d) cudaSetDevice() failed (arrayName: %s).\n", - __func__, - __LINE__, - arrayName); - exit(-1); - } + RAFT_CUDA_TRY(cudaSetDevice(devId)); if (useCudaMalloc) { - cudaError = cudaMalloc(&(arrays[devId]), sizeof(T) * numArrayElements); - if (cudaError != cudaSuccess) { - fprintf( - stderr, "(%s, %d) cudaMalloc() failed (arrayName: %s).\n", __func__, __LINE__, arrayName); - exit(-1); - } + RAFT_CUDA_TRY(cudaMalloc(&(arrays[devId]), sizeof(T) * numArrayElements)); } else { - cudaError = cudaMallocManaged(&(arrays[devId]), sizeof(T) * numArrayElements); - if (cudaError != cudaSuccess) { - fprintf(stderr, - "(%s, %d) cudaMallocManaged() failed (arrayName: %s).\n", - __func__, - __LINE__, - arrayName); - exit(-1); - } + RAFT_CUDA_TRY(cudaMallocManaged(&(arrays[devId]), sizeof(T) * numArrayElements)); } } - cudaError = cudaSetDevice(orgDevId); - if (cudaError != cudaSuccess) { - fprintf( - stderr, "(%s, %d) cudaSetDevice() failed (arrayName: %s)\n", __func__, __LINE__, arrayName); - exit(-1); - } + RAFT_CUDA_TRY(cudaSetDevice(orgDevId)); return arrays; } @@ -919,16 +834,11 @@ inline void _cuann_kmeans_predict(const handle_t& handle, return; } - cudaError_t cudaError; uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); void* workspace = _workspace; if (_workspace == NULL) { size_t sizeWorkspace = _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset); - cudaError = cudaMallocManaged(&workspace, sizeWorkspace); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - exit(-1); - } + RAFT_CUDA_TRY(cudaMallocManaged(&workspace, sizeWorkspace)); } float* curDataset; // [chunk, dimCenters] void* bufDataset; // [chunk, dimCenters] @@ -966,30 +876,21 @@ inline void _cuann_kmeans_predict(const handle_t& handle, uint64_t ie = min(is + chunk, (uint64_t)numDataset); uint32_t nDataset = ie - is; - // RAFT_LOG_INFO( - // "_cuann_kmeans_predict(dimCenters = %u, nDataset = %u, is = %zu)", dimCenters, nDataset, - // is); if (dtype == CUDA_R_32F) { - // TODO: CRASH: Program hit cudaErrorIllegalAddress (error 700) due to "an illegal memory - // access was encountered" on CUDA API call to cudaMemcpyAsync_ptsz. 
- cudaError = cudaMemcpy(bufDataset, - (float*)dataset + (is * dimCenters), - sizeof(float) * nDataset * dimCenters, - kind); + RAFT_CUDA_TRY(cudaMemcpy(bufDataset, + (float*)dataset + (is * dimCenters), + sizeof(float) * nDataset * dimCenters, + kind)); } else if (dtype == CUDA_R_8U) { - cudaError = cudaMemcpy(bufDataset, - (uint8_t*)dataset + (is * dimCenters), - sizeof(uint8_t) * nDataset * dimCenters, - kind); + RAFT_CUDA_TRY(cudaMemcpy(bufDataset, + (uint8_t*)dataset + (is * dimCenters), + sizeof(uint8_t) * nDataset * dimCenters, + kind)); } else if (dtype == CUDA_R_8I) { - cudaError = cudaMemcpy(bufDataset, - (int8_t*)dataset + (is * dimCenters), - sizeof(int8_t) * nDataset * dimCenters, - kind); - } - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMemcpy() failed.\n", __func__, __LINE__); - exit(-1); + RAFT_CUDA_TRY(cudaMemcpy(bufDataset, + (int8_t*)dataset + (is * dimCenters), + sizeof(int8_t) * nDataset * dimCenters, + kind)); } if (dtype == CUDA_R_32F) { @@ -1040,15 +941,6 @@ inline void _cuann_kmeans_predict(const handle_t& handle, _cuann_accumulate_with_label( numCenters, dimCenters, tempCenters, clusterSize, nDataset, curDataset, labels + is); } -#if 0 - // debug - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", - __func__, __LINE__); - exit(-1); - } -#endif } if ((tempCenters != NULL) && (clusterSize != NULL) && updateCenter) { @@ -1294,10 +1186,9 @@ bool _cuann_kmeans_adjust_centers(float* centers, // [numCenters, dimCenters] float threshold, void* ws) { - if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) { - fprintf(stderr, "(%s, %d) Unsupported dtype (%d)\n", __func__, __LINE__, dtype); - exit(-1); - } + RAFT_EXPECTS(dtype == CUDA_R_32F || dtype == CUDA_R_8U || dtype == CUDA_R_8I, + "Unsupported dtype (%d)", + dtype); bool adjusted = false; static uint32_t iPrimes = 0; constexpr uint32_t numPrimes = 40; @@ -1396,14 +1287,8 @@ bool _cuann_kmeans_adjust_centers(float* centers, // [numCenters, dimCenters] } if (count > 0) { adjusted = true; -#ifdef CUANN_DEBUG - fprintf(stderr, - "(%s) num adjusted: %u / %u, threshold: %d \n", - __func__, - count, - numCenters, - (int)(average * threshold)); -#endif + RAFT_LOG_DEBUG( + "num adjusted: %u / %u, threshold: %d \n", count, numCenters, (int)(average * threshold)); } } return adjusted; @@ -1746,18 +1631,6 @@ __launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ } } } - -#ifdef CUANN_DEBUG - cg::sync(grid); - if (thread_id == 0 && count[0] < topk) { - printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", - i_batch, - topk, - count[0], - count_below, - threshold); - } -#endif } // @@ -1979,18 +1852,6 @@ __launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ } } } - -#ifdef CUANN_DEBUG - __syncthreads(); - if (thread_id == 0 && count[0] < topk) { - printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", - i_batch, - topk, - count[0], - count_below, - threshold); - } -#endif } // @@ -2225,18 +2086,6 @@ __launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ } } } - -#ifdef CUANN_DEBUG - cg::sync(grid); - if (thread_id == 0 && count[0] < topk) { - printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", - i_batch, - topk, - count[0], - count_below, - threshold); - } -#endif } // @@ -2383,18 +2232,6 @@ __launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ } } } - -#ifdef CUANN_DEBUG - __syncthreads(); - if (thread_id == 
0 && count[0] < topk) { - printf("# i_batch:%d, topk:%d, count[0]:%d, count_below:%d, threshold:%08x\n", - i_batch, - topk, - count[0], - count_below, - threshold); - } -#endif } // @@ -3429,25 +3266,9 @@ inline void _cuann_get_inclusiveSumSortedClusterSize( desc->_numClustersSize0 += 1; // Work-around for clusters of size 0 -#if 0 - printf("# i:%d, %u ... ", i, (*output)[i]); - for (int j = 0; j < desc->dimDatasetExt; j++) { - printf( "%.3f, ", clusterCenters[ j + (desc->dimDatasetExt * i) ] ); - } - printf( "\n" ); -#endif _cuann_get_random_norm_vector(desc->dimDatasetExt, clusterCenters + (desc->dimDatasetExt * i)); -#if 0 - printf("# i:%d, %u ... ", i, (*output)[i]); - for (int j = 0; j < desc->dimDatasetExt; j++) { - printf( "%.3f, ", clusterCenters[ j + (desc->dimDatasetExt * i) ] ); - } - printf( "\n" ); -#endif - } - if (1 || desc->_numClustersSize0 > 0) { - fprintf(stderr, "# num clusters of size 0: %d\n", desc->_numClustersSize0); } + RAFT_LOG_DEBUG("Number of clusters of size zero: %d", desc->_numClustersSize0); // sort qsort(*output, desc->numClusters, sizeof(uint32_t), descending); // scan @@ -3462,13 +3283,8 @@ inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, float** output // [numClusters,] ) { - cudaError_t cudaError; if (*output != NULL) { cudaFree(*output); } - cudaError = cudaMallocManaged(output, sizeof(float) * desc->numClusters); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaMallocManaged() failed.\n", __func__, __LINE__); - exit(-1); - } + RAFT_CUDA_TRY(cudaMallocManaged(output, sizeof(float) * desc->numClusters)); switch (detail::utils::check_pointer_residency(clusterCenters, *output)) { case detail::utils::pointer_residency::device_only: case detail::utils::pointer_residency::host_and_device: break; @@ -3521,7 +3337,7 @@ inline void _cuann_make_rotation_matrix(uint32_t nRows, assert(nRows % lenPq == 0); if (randomRotation) { - fprintf(stderr, "# create rotation matrix randomly.\n"); + RAFT_LOG_DEBUG("Creating a random rotation matrix."); double dot, norm; double* matrix = (double*)malloc(sizeof(double) * nRows * nCols); memset(matrix, 0, sizeof(double) * nRows * nCols); @@ -3643,16 +3459,8 @@ inline void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq int _cuann_set_device(int devId) { int orgDevId; - cudaError_t cudaError = cudaGetDevice(&orgDevId); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaGetDevice() failed (%d)\n", __func__, __LINE__, cudaError); - exit(-1); - } - cudaError = cudaSetDevice(devId); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaSetDevice() failed (%d)\n", __func__, __LINE__, cudaError); - exit(-1); - } + RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); + RAFT_CUDA_TRY(cudaSetDevice(devId)); return orgDevId; } @@ -4026,10 +3834,7 @@ inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); *size = sizeof(struct cuannIvfPqIndexHeader); - if (*size != 1024) { - fprintf(stderr, "(%s, %d) Unexpected Error!\n", __func__, __LINE__); - exit(-1); - } + RAFT_EXPECTS(*size == 1024, "Critical error: unexpected header size."); *size += _cuann_getIndexSize_clusterCenters(desc); *size += _cuann_getIndexSize_pqCenters(desc); *size += _cuann_getIndexSize_pqDataset(desc); @@ -4063,7 +3868,8 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, desc->dtypeDataset = dtype; char dtypeString[64]; - fprintf(stderr, "# dtypeDataset: %s\n", _cuann_get_dtype_string(desc->dtypeDataset, 
dtypeString)); + _cuann_get_dtype_string(desc->dtypeDataset, dtypeString); + RAFT_LOG_DEBUG("Dataset dtype = %s", dtypeString); switch (detail::utils::check_pointer_residency(dataset, trainset)) { case detail::utils::pointer_residency::host_only: @@ -4106,11 +3912,15 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, // // Training kmeans // - fprintf(stderr, "# hierarchicalClustering: %u\n", hierarchicalClustering); + if (hierarchicalClustering) { + RAFT_LOG_DEBUG("Hierarchical clustering: enabled"); + } else { + RAFT_LOG_DEBUG("Hierarchical clustering: disabled"); + } if (hierarchicalClustering) { // Hierarchical kmeans uint32_t numMesoClusters = pow((double)(desc->numClusters), (double)1.0 / 2.0) + 0.5; - fprintf(stderr, "# numMesoClusters: %u\n", numMesoClusters); + RAFT_LOG_DEBUG("numMesoClusters: %u", numMesoClusters); float* mesoClusterCenters; // [numMesoClusters, dimDataset] RAFT_CUDA_TRY( @@ -4131,12 +3941,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, // int numIterations_2 = numIterations * 2; for (int iter = 0; iter < numIterations_2; iter += 2) { - fprintf(stderr, - "(%s) " - "Training kmeans for meso-clusters: %.1f / %u \r", - __func__, - (float)iter / 2, - numIterations); _cuann_kmeans_predict(handle, mesoClusterCenters, numMesoClusters, @@ -4165,7 +3969,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, iter -= 1; } } - fprintf(stderr, "\n"); cudaDeviceSynchronize(); // Number of centers in each meso cluster @@ -4276,16 +4079,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, } int numIterations_2 = numIterations * 2; for (int iter = 0; iter < numIterations_2; iter += 2) { - if (devId == 0) { - fprintf(stderr, - "(%s) Training kmeans for clusters in " - "meso-cluster %u (numClusters: %u): %.1f / %u \r", - __func__, - i, - numFineClusters[i], - (float)iter / 2, - numIterations); - } _cuann_kmeans_predict(handle, clusterCentersEach[devId], numFineClusters[i], @@ -4323,7 +4116,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, cudaSetDevice(devId); cudaDeviceSynchronize(); } - fprintf(stderr, "\n"); cudaSetDevice(cuannDevId); _cuann_multi_device_free(idsTrainset, 1); @@ -4354,12 +4146,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, const int X = 5; int numIterations_X = max(numIterations / 10, 2) * X; for (int iter = 0; iter < numIterations_X; iter += X) { - fprintf(stderr, - "(%s) " - "Fine-tuning kmeans for whole clusters: %.1f / %d \r", - __func__, - (float)iter / X, - numIterations_X / X); _cuann_kmeans_predict_MP(handle, clusterCenters, desc->numClusters, @@ -4386,13 +4172,10 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, iter -= (X - 1); } } - fprintf(stderr, "\n"); } else { // Flat kmeans int numIterations_2 = numIterations * 2; for (int iter = 0; iter < numIterations_2; iter += 2) { - fprintf( - stderr, "(%s) Training kmeans: %.1f / %u \r", __func__, (float)iter / 2, numIterations); _cuann_kmeans_predict(handle, clusterCenters, desc->numClusters, @@ -4421,7 +4204,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, iter -= 1; } } - fprintf(stderr, "\n"); } uint32_t* datasetLabels; // [numDataset] @@ -4430,7 +4212,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, // // Predict labels of whole dataset (with multiple GPUs) // - fprintf(stderr, "(%s) Final fitting\n", __func__); _cuann_kmeans_predict_MP(handle, clusterCenters, desc->numClusters, @@ -4450,9 +4231,9 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, #endif // Make rotation matrix - 
fprintf(stderr, "# dimDataset: %u\n", desc->dimDataset); - fprintf(stderr, "# dimRotDataset: %u\n", desc->dimRotDataset); - fprintf(stderr, "# randomRotation: %u\n", randomRotation); + RAFT_LOG_DEBUG("# dimDataset: %u\n", desc->dimDataset); + RAFT_LOG_DEBUG("# dimRotDataset: %u\n", desc->dimRotDataset); + RAFT_LOG_DEBUG("# randomRotation: %s\n", randomRotation ? "enabled" : "disabled"); _cuann_make_rotation_matrix( desc->dimRotDataset, desc->dimDataset, desc->lenPq, randomRotation, rotationMatrix); @@ -4487,7 +4268,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, } RAFT_EXPECTS(indexPtr[desc->numClusters] == desc->numDataset, "Cluster sizes do not add up"); desc->maxClusterSize = maxClusterSize; - // fprintf(stderr, "(%s) maxClusterSize: %u\n", __func__, maxClusterSize); // originalNumbers for (uint32_t i = 0; i < desc->numDataset; i++) { @@ -4525,7 +4305,6 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, // // Predict label of trainset again (with multiple GPUs) - fprintf(stderr, "(%s) Predict label of trainset again\n", __func__); _cuann_kmeans_predict_MP(handle, clusterCenters, desc->numClusters, @@ -4758,7 +4537,7 @@ inline void cuannIvfPqSaveIndex(const handle_t& handle, RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)index; - fprintf(stderr, "(%s) indexSize: %lu\n", __func__, header->indexSize); + RAFT_LOG_DEBUG("indexSize: %lu\n", header->indexSize); if (fwrite(index, 1, header->indexSize, fp) != header->indexSize) { RAFT_FAIL("(%s) failed to save index to file (%s)\n", __func__, fileName); } @@ -4782,7 +4561,7 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, size_t indexSize; fread(&indexSize, sizeof(size_t), 1, fp); - fprintf(stderr, "(%s) indexSize: %lu\n", __func__, indexSize); + RAFT_LOG_DEBUG("indexSize: %lu\n", indexSize); RAFT_CUDA_TRY(cudaMallocManaged(index, indexSize)); fseek(fp, 0, SEEK_SET); if (fread(*index, 1, indexSize, fp) != indexSize) { @@ -4892,8 +4671,9 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( cuannIvfPqLoadIndex(handle, oldDesc, &oldIndex, oldIndexFileName); cudaDataType_t dtype = oldDesc->dtypeDataset; char dtypeString[64]; - fprintf(stderr, "(%s) dtype: %s\n", __func__, _cuann_get_dtype_string(dtype, dtypeString)); - fprintf(stderr, "(%s) dimDataset: %u\n", __func__, oldDesc->dimDataset); + _cuann_get_dtype_string(dtype, dtypeString); + RAFT_LOG_DEBUG("dtype: %s", dtypeString); + RAFT_LOG_DEBUG("dimDataset: %u", oldDesc->dimDataset); struct cuannIvfPqIndexHeader* oldHeader; float* oldClusterCenters; // [numClusters, dimDatasetExt] float* oldPqCenters; // [dimPq, 1 << bitPq, lenPq], or @@ -4938,7 +4718,6 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( uint32_t* clusterSize; // [numClusters,] RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * oldDesc->numClusters)); cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters); - fprintf(stderr, "(%s) Predict label of new vectors\n", __func__); _cuann_kmeans_predict_MP(handle, clusterCenters, oldDesc->numClusters, @@ -5045,15 +4824,14 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( cuannIvfPqCreateDescriptor(&newDesc); memcpy(newDesc, oldDesc, sizeof(struct cuannIvfPqDescriptor)); newDesc->numDataset += numNewVectors; - fprintf( - stderr, "(%s) numDataset: %u -> %u\n", __func__, oldDesc->numDataset, newDesc->numDataset); + RAFT_LOG_DEBUG("numDataset: %u -> %u", oldDesc->numDataset, 
newDesc->numDataset); // // Allocate memory for new index // size_t newIndexSize; cuannIvfPqGetIndexSize(newDesc, &newIndexSize); - fprintf(stderr, "(%s) indexSize: %lu -> %lu\n", __func__, oldHeader->indexSize, newIndexSize); + RAFT_LOG_DEBUG("indexSize: %lu -> %lu", oldHeader->indexSize, newIndexSize); void* newIndex = malloc(newIndexSize); memset(newIndex, 0, newIndexSize); struct cuannIvfPqIndexHeader* newHeader; @@ -5107,11 +4885,7 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( newDesc->maxClusterSize = maxClusterSize; newHeader->maxClusterSize = maxClusterSize; } - fprintf(stderr, - "(%s) maxClusterSize: %u -> %u\n", - __func__, - oldDesc->maxClusterSize, - newDesc->maxClusterSize); + RAFT_LOG_DEBUG("maxClusterSize: %u -> %u", oldDesc->maxClusterSize, newDesc->maxClusterSize); // // Make newOriginalNumbers @@ -5148,14 +4922,13 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // cuannIvfPqSaveIndex(handle, newDesc, newIndex, newIndexFileName); if (newHeader->numDatasetAdded * 2 >= newHeader->numDataset) { - fprintf(stderr, - "(%s) The total number of vectors in the new index" - " is now more than twice the initial number of vectors." - " You may want to re-build the index from scratch." - " (numVectors: %u, numVectorsAdded: %u)\n", - __func__, - newHeader->numDataset, - newHeader->numDatasetAdded); + RAFT_LOG_INFO( + "The total number of vectors in the new index" + " is now more than twice the initial number of vectors." + " You may want to re-build the index from scratch." + " (numVectors: %u, numVectorsAdded: %u)", + newHeader->numDataset, + newHeader->numDatasetAdded); } // @@ -5208,19 +4981,13 @@ inline void cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, numProbes, topK, numSamplesWorstCase); - desc->numProbes = numProbes; - desc->topK = topK; - if (0) { - char dtypeString[64]; - fprintf( - stderr, "# dtypeDataset: %s\n", _cuann_get_dtype_string(desc->dtypeDataset, dtypeString)); - } + desc->numProbes = numProbes; + desc->topK = topK; desc->maxSamples = desc->inclusiveSumSortedClusterSize[numProbes - 1]; if (desc->maxSamples % 128) { desc->maxSamples += 128 - (desc->maxSamples % 128); } desc->internalDistanceDtype = CUDA_R_32F; desc->smemLutDtype = CUDA_R_32F; desc->preferredThreadBlockSize = 0; - // fprintf(stderr, "# maxSample: %u\n", desc->inclusiveSumSortedClusterSize[0]); } // cuannIvfPqSetSearchParameters @@ -5239,16 +5006,9 @@ inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, preferredThreadBlockSize == 1024 || preferredThreadBlockSize == 0, "preferredThreadBlockSize must be 0, 256, 512 or 1024, but %u is given.", preferredThreadBlockSize); - desc->internalDistanceDtype = internalDistanceDtype; - desc->smemLutDtype = smemLutDtype; - if (0) { - char dtypeString[64]; - fprintf(stderr, - "# internalDistanceDtype: %s\n", - _cuann_get_dtype_string(desc->internalDistanceDtype, dtypeString)); - } + desc->internalDistanceDtype = internalDistanceDtype; + desc->smemLutDtype = smemLutDtype; desc->preferredThreadBlockSize = preferredThreadBlockSize; - // fprintf(stderr, "# maxSample: %u\n", desc->inclusiveSumSortedClusterSize[0]); } // cuannIvfPqGetSearchParameters @@ -5300,7 +5060,6 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, if (size_0 > max_ws) { maxQueries = maxQueries * max_ws / size_0; if (maxQueries > 32) { maxQueries -= (maxQueries % 32); } - // fprintf(stderr, "(%s) maxQueries is reduced to %u.\n", __func__, maxQueries); } // maxQueries = min(max(maxQueries, 1), 1024); // 
maxQueries = min(max(maxQueries, 1), 2048); @@ -5337,9 +5096,6 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, uint32_t numCta_perBatch_1 = numCta_perBatch + 1; uint32_t maxBatchSize_1 = numCta_total / numCta_perBatch_1; float utilization_1 = (float)numCta_perBatch_1 * maxBatchSize_1 / numCta_total; - // fprintf(stderr, "# maxBatchSize :%u, utilization :%f\n", desc->maxBatchSize, - // utilization); fprintf(stderr, "# maxBatchSize_1:%u, utilization_1:%f\n", maxBatchSize_1, - // utilization_1); if (utilization < utilization_1) { desc->maxBatchSize = maxBatchSize_1; } } } @@ -5350,14 +5106,10 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, size_t size_2 = ivfpq_search_bufferSize(handle, desc); *workspaceSize += max(size_1, size_2); -#ifdef CUANN_DEBUG - fprintf(stderr, "# maxQueries: %u\n", maxQueries); - fprintf(stderr, "# maxBatchSize: %u\n", desc->maxBatchSize); - fprintf(stderr, - "# workspaceSize: %lu (%.3f GiB)\n", - *workspaceSize, - (float)*workspaceSize / 1024 / 1024 / 1024); -#endif + RAFT_LOG_TRACE("maxQueries: %u", maxQueries); + RAFT_LOG_TRACE("maxBatchSize: %u", desc->maxBatchSize); + RAFT_LOG_DEBUG( + "workspaceSize: %lu (%.3f GiB)", *workspaceSize, (float)*workspaceSize / 1024 / 1024 / 1024); } // cuannIvfPqSearch @@ -6457,8 +6209,6 @@ inline void ivfpq_search(const handle_t& handle, float* preCompScores = NULL; void* topkWorkspace; - cudaError_t cudaError; - clusterLabelsOut = (uint32_t*)workspace; indexList = (uint32_t*)((uint8_t*)clusterLabelsOut + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); @@ -6639,7 +6389,6 @@ inline void ivfpq_search(const handle_t& handle, } else { numThreads = desc->preferredThreadBlockSize; } - // printf("# numThreads: %d\n", numThreads); size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); sizeSmem = max(sizeSmem, sizeSmemForLocalTopk); @@ -6647,7 +6396,7 @@ inline void ivfpq_search(const handle_t& handle, bool kernel_no_basediff_available = true; if (sizeSmem > thresholdSmem) { - cudaError = cudaFuncSetAttribute( + cudaError_t cudaError = cudaFuncSetAttribute( kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem); if (cudaError != cudaSuccess) { RAFT_EXPECTS( @@ -6666,7 +6415,7 @@ inline void ivfpq_search(const handle_t& handle, if (kernel_no_basediff_available) { bool kernel_fast_available = true; if (sizeSmem + sizeSmemBaseDiff > thresholdSmem) { - cudaError = cudaFuncSetAttribute( + cudaError_t cudaError = cudaFuncSetAttribute( kernel_fast, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem + sizeSmemBaseDiff); if (cudaError != cudaSuccess) { RAFT_EXPECTS( @@ -6675,29 +6424,14 @@ inline void ivfpq_search(const handle_t& handle, kernel_fast_available = false; } } -#if 0 - fprintf( stderr, - "# sizeSmem: %lu, sizeSmemBaseDiff: %lu, kernel_fast_available: %d\n", - sizeSmem, sizeSmemBaseDiff, kernel_fast_available ); -#endif if (kernel_fast_available) { int numBlocks_kernel_no_basediff = 0; - cudaError = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks_kernel_no_basediff, kernel_no_basediff, numThreads, sizeSmem); - // fprintf(stderr, "# numBlocks_kernel_no_basediff: %d\n", numBlocks_kernel_no_basediff); - if (cudaError != cudaSuccess) { - fprintf(stderr, "cudaOccupancyMaxActiveBlocksPerMultiprocessor() failed\n"); - exit(-1); - } + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks_kernel_no_basediff, kernel_no_basediff, numThreads, sizeSmem)); int numBlocks_kernel_fast = 0; - cudaError = 
cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks_kernel_fast, kernel_fast, numThreads, sizeSmem + sizeSmemBaseDiff); - // fprintf(stderr, "# numBlocks_kernel_fast: %d\n", numBlocks_kernel_fast); - if (cudaError != cudaSuccess) { - fprintf(stderr, "cudaOccupancyMaxActiveBlocksPerMultiprocessor() failed\n"); - exit(-1); - } + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks_kernel_fast, kernel_fast, numThreads, sizeSmem + sizeSmemBaseDiff)); // Use "kernel_fast" only if GPU occupancy does not drop if (numBlocks_kernel_no_basediff == numBlocks_kernel_fast) { @@ -6754,11 +6488,7 @@ inline void ivfpq_search(const handle_t& handle, topkWorkspace); } #ifdef CUANN_DEBUG - cudaError = cudaDeviceSynchronize(); - if (cudaError != cudaSuccess) { - fprintf(stderr, "(%s, %d) cudaDeviceSynchronize() failed.\n", __func__, __LINE__); - exit(-1); - } + handle.sync_stream(); #endif // From 4f64cd30311ba0c96d7eb1c15e01d6a203d6b0b6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 14:34:29 +0200 Subject: [PATCH 010/140] Use helper accumulate_into_selected --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 90 +++---------------------- 1 file changed, 8 insertions(+), 82 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index d183b8140d..643dbda208 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -454,65 +454,6 @@ inline void _cuann_a_me_b(uint32_t nRows, kern_a_me_b<<>>(nRows, nCols, a, ldA, b); } -// accumulate -template -__global__ void kern_accumulate_with_label(uint32_t nRowsOutput, - uint32_t nCols, - float* output, // [nRowsOutput, nCols,] - uint32_t* count, // [nRowsOutput,] - uint32_t nRowsInput, - const T* input, // [nRowsInput, nCols,] - const uint32_t* label, // [nRowsInput,] - float divisor) -{ - uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint64_t iCol = gid % nCols; - uint64_t iRowInput = gid / nCols; - if (iRowInput >= nRowsInput) return; - uint64_t iRowOutput = label[iRowInput]; - if (iCol == 0) { atomicAdd(&(count[iRowOutput]), 1); } - atomicAdd(&(output[iCol + (nCols * iRowOutput)]), input[gid] / divisor); -} - -// accumulate -template -inline void _cuann_accumulate_with_label(uint32_t nRowsOutput, - uint32_t nCols, - float* output, // [nRowsOutput, nCols,] - uint32_t* count, // [nRowsOutput,] - uint32_t nRowsInput, - const T* input, // [nRowsInput, nCols,] - const uint32_t* label, // [nRowsInput,] - float divisor = 1.0) -{ - bool useGPU = 1; - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, output); - if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } - cudaPointerGetAttributes(&attr, count); - if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } - cudaPointerGetAttributes(&attr, input); - if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; } - - if (useGPU) { - // GPU - uint32_t nThreads = 128; - uint64_t nBlocks = (((uint64_t)nRowsInput * nCols) + nThreads - 1) / nThreads; - kern_accumulate_with_label - <<>>(nRowsOutput, nCols, output, count, nRowsInput, input, label, divisor); - } else { - // CPU - cudaDeviceSynchronize(); - for (uint64_t i = 0; i < nRowsInput; i++) { - uint64_t l = label[i]; - count[l] += 1; - for (uint64_t j = 0; j < nCols; j++) { - output[j + (nCols * l)] += input[j + (nCols * i)] / divisor; - } - } - } -} - // normalize __global__ void kern_normalize(uint32_t 
nRows, uint32_t nCols, @@ -727,28 +668,14 @@ inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCen detail::utils::memzero(centers, numCenters * dimCenters, stream); detail::utils::memzero(clusterSize, numCenters, stream); if (dtype == CUDA_R_32F) { - _cuann_accumulate_with_label( - numCenters, dimCenters, centers, clusterSize, numDataset, (const float*)dataset, labels); + detail::utils::accumulate_into_selected( + numDataset, dimCenters, centers, clusterSize, (const float*)dataset, labels, stream); } else if (dtype == CUDA_R_8U) { - float divisor = 256.0; - _cuann_accumulate_with_label(numCenters, - dimCenters, - centers, - clusterSize, - numDataset, - (const uint8_t*)dataset, - labels, - divisor); + detail::utils::accumulate_into_selected( + numDataset, dimCenters, centers, clusterSize, (const uint8_t*)dataset, labels, stream); } else if (dtype == CUDA_R_8I) { - float divisor = 128.0; - _cuann_accumulate_with_label(numCenters, - dimCenters, - centers, - clusterSize, - numDataset, - (const int8_t*)dataset, - labels, - divisor); + detail::utils::accumulate_into_selected( + numDataset, dimCenters, centers, clusterSize, (const int8_t*)dataset, labels, stream); } } else { cudaMemcpy( @@ -938,8 +865,8 @@ inline void _cuann_kmeans_predict(const handle_t& handle, if ((tempCenters != NULL) && (clusterSize != NULL)) { // accumulate - _cuann_accumulate_with_label( - numCenters, dimCenters, tempCenters, clusterSize, nDataset, curDataset, labels + is); + detail::utils::accumulate_into_selected( + nDataset, dimCenters, tempCenters, clusterSize, curDataset, labels + is, stream); } } @@ -1318,7 +1245,6 @@ bool _cuann_kmeans_adjust_centers(float* centers, // [numCenters, dimCenters] #define NUM_THREADS 1024 // DO NOT CHANGE #define STATE_BIT_LENGTH 8 // 0: state not used, 8: state used #define MAX_VEC_LENGTH 8 // 1, 2, 4 or 8 -// #define CUANN_DEBUG // __device__ inline uint32_t convert(uint32_t x) From 0a7324b4ded352a6e87fae107980a12460898a6a Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 14:59:36 +0200 Subject: [PATCH 011/140] Use helper copy_selected --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 356 +++++++++--------------- 1 file changed, 132 insertions(+), 224 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 643dbda208..4a7a924fbe 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -196,14 +197,6 @@ inline char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) return string; } -// -inline size_t _cuann_aligned(size_t size) -{ - size_t unit = 128; - if (size % unit) { size += unit - (size % unit); } - return size; -} - // copy template __global__ void kern_copy(uint32_t nRows, @@ -265,25 +258,6 @@ template void _cuann_copy(uint32_t nRows, uint32_t ldDst, float divisor); -// copy_CPU -template -inline void _cuann_copy_CPU(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, ldDst] - uint32_t ldDst) -{ - for (uint32_t ir = 0; ir < nRows; ir++) { - for (uint32_t ic = 0; ic < nCols; ic++) { - dst[ic + (ldDst * ir)] = src[ic + (ldSrc * ir)]; - } - } -} - -template void _cuann_copy_CPU( - uint32_t nRows, uint32_t nCols, const float* src, uint32_t ldSrc, float* dst, uint32_t ldDst); - // copy_fill template __global__ void kern_copy_fill(uint32_t nRows, @@ -354,78 +328,6 @@ template void _cuann_copy_fill(uint32_t nRows, 
float divisor, cudaStream_t stream); -// copy with row list -template -__global__ void kern_copy_with_list(uint32_t nRows, - uint32_t nCols, - const T* src, // [..., ldSrc] - const uint32_t* rowList, // [nRows,] - uint32_t ldSrc, - float* dst, // [nRows, ldDst] - uint32_t ldDst, - float divisor) -{ - uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint64_t iCol = gid % nCols; - uint64_t iRow = gid / nCols; - if (iRow >= nRows) return; - uint64_t iaRow = rowList[iRow]; - dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor; -} - -// copy with row list -template -inline void _cuann_copy_with_list(uint32_t nRows, - uint32_t nCols, - const T* src, // [..., ldSrc] - const uint32_t* rowList, // [nRows,] - uint32_t ldSrc, - float* dst, // [nRows, ldDst] - uint32_t ldDst, - float divisor = 1.0f) -{ - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, src); - if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { - for (uint64_t iRow = 0; iRow < nRows; iRow++) { - uint64_t iaRow = rowList[iRow]; - for (uint64_t iCol = 0; iCol < nCols; iCol++) { - dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor; - } - } - } else { - uint32_t nThreads = 128; - uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; - kern_copy_with_list - <<>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst, divisor); - } -} - -template void _cuann_copy_with_list(uint32_t nRows, - uint32_t nCols, - const float* src, - const uint32_t* rowList, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float divisor); -template void _cuann_copy_with_list(uint32_t nRows, - uint32_t nCols, - const uint8_t* src, - const uint32_t* rowList, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float divisor); -template void _cuann_copy_with_list(uint32_t nRows, - uint32_t nCols, - const int8_t* src, - const uint32_t* rowList, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float divisor); - // a -= b __global__ void kern_a_me_b(uint32_t nRows, uint32_t nCols, @@ -716,11 +618,11 @@ inline size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); size_t size = 0; // float *curDataset; // [chunk, dimCenters] - size += _cuann_aligned(sizeof(float) * chunk * dimCenters); + size += Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters); // void *bufDataset; // [chunk, dimCenters] - size += _cuann_aligned(sizeof(float) * chunk * dimCenters); + size += Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters); // float *workspace; - size += _cuann_aligned(sizeof(float) * (numCenters + chunk + (numCenters * chunk))); + size += Pow2<128>::roundUp(sizeof(float) * (numCenters + chunk + (numCenters * chunk))); return size; } @@ -771,9 +673,10 @@ inline void _cuann_kmeans_predict(const handle_t& handle, void* bufDataset; // [chunk, dimCenters] // float* workspace_core; curDataset = (float*)workspace; - bufDataset = (void*)((uint8_t*)curDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); + bufDataset = + (void*)((uint8_t*)curDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters)); // workspace_core = - // (float*)((uint8_t*)bufDataset + _cuann_aligned(sizeof(float) * chunk * dimCenters)); + // (float*)((uint8_t*)bufDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters)); auto stream = handle.get_stream(); if (tempCenters != NULL && clusterSize != NULL) { @@ -2194,9 +2097,9 @@ inline size_t _cuann_find_topk_bufferSize(const handle_t& handle, size_t workspaceSize = 0; // 
count if (sampleDtype == CUDA_R_16F) { - workspaceSize += _cuann_aligned(sizeof(uint32_t) * sizeBatch * 2 * 256); + workspaceSize += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 2 * 256); } else { - workspaceSize += _cuann_aligned(sizeof(uint32_t) * sizeBatch * 5 * 1024); + workspaceSize += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 5 * 1024); } // state if (stateBitLen == 8) { @@ -2207,16 +2110,16 @@ inline size_t _cuann_find_topk_bufferSize(const handle_t& handle, uint32_t numSample_perThread = (maxSamples + numThreads_perBatch - 1) / numThreads_perBatch; uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; workspaceSize += - _cuann_aligned(sizeof(uint8_t) * numState_perThread * numThreads_perBatch * sizeBatch); + Pow2<128>::roundUp(sizeof(uint8_t) * numState_perThread * numThreads_perBatch * sizeBatch); } size_t workspaceSize2 = 0; // offsets - workspaceSize2 += _cuann_aligned(sizeof(int) * (sizeBatch + 1)); + workspaceSize2 += Pow2<128>::roundUp(sizeof(int) * (sizeBatch + 1)); // keys_in, keys_out, values_out - workspaceSize2 += _cuann_aligned(sizeof(float) * sizeBatch * topK); - workspaceSize2 += _cuann_aligned(sizeof(float) * sizeBatch * topK); - workspaceSize2 += _cuann_aligned(sizeof(uint32_t) * sizeBatch * topK); + workspaceSize2 += Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK); + workspaceSize2 += Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK); + workspaceSize2 += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * topK); // cub_ws size_t cub_ws_size = 0; cub::DeviceSegmentedRadixSort::SortPairs(NULL, @@ -2229,7 +2132,7 @@ inline size_t _cuann_find_topk_bufferSize(const handle_t& handle, sizeBatch, (int*)NULL, (int*)NULL); - workspaceSize2 += _cuann_aligned(cub_ws_size); + workspaceSize2 += Pow2<128>::roundUp(cub_ws_size); workspaceSize = max(workspaceSize, workspaceSize2); return workspaceSize; @@ -2293,7 +2196,7 @@ inline void _cuann_find_topk(const handle_t& handle, uint32_t* count = (uint32_t*)workspace; uint8_t* state = NULL; if (stateBitLen == 8) { - state = (uint8_t*)count + _cuann_aligned(sizeof(uint32_t) * sizeBatch * 5 * 1024); + state = (uint8_t*)count + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 5 * 1024); } dim3 threads(numThreads, 1, 1); @@ -2330,13 +2233,14 @@ inline void _cuann_find_topk(const handle_t& handle, // offsets: [sizeBatch + 1] // keys_in, keys_out, values_out: [sizeBatch, topK] - int* offsets = (int*)workspace; - float* keys_in = (float*)((uint8_t*)offsets + _cuann_aligned(sizeof(int) * (sizeBatch + 1))); - float* keys_out = (float*)((uint8_t*)keys_in + _cuann_aligned(sizeof(float) * sizeBatch * topK)); + int* offsets = (int*)workspace; + float* keys_in = (float*)((uint8_t*)offsets + Pow2<128>::roundUp(sizeof(int) * (sizeBatch + 1))); + float* keys_out = + (float*)((uint8_t*)keys_in + Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK)); uint32_t* values_out = - (uint32_t*)((uint8_t*)keys_out + _cuann_aligned(sizeof(float) * sizeBatch * topK)); + (uint32_t*)((uint8_t*)keys_out + Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK)); void* cub_ws = - (void*)((uint8_t*)values_out + _cuann_aligned(sizeof(uint32_t) * sizeBatch * topK)); + (void*)((uint8_t*)values_out + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * topK)); dim3 stpThreads(128, 1, 1); dim3 stpBlocks((max(sizeBatch + 1, sizeBatch * topK) + stpThreads.x - 1) / stpThreads.x, 1, 1); @@ -2422,7 +2326,7 @@ inline void _cuann_find_topk(const handle_t& handle, uint32_t* count = (uint32_t*)workspace; uint8_t* state = NULL; if 
(stateBitLen == 8) { - state = (uint8_t*)count + _cuann_aligned(sizeof(uint32_t) * sizeBatch * 2 * 256); + state = (uint8_t*)count + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 2 * 256); } dim3 threads(numThreads, 1, 1); @@ -2728,13 +2632,13 @@ inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescript { size_t size = 0; // clusterLabelsOut [maxBatchSize, numProbes] - size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); // indexList [maxBatchSize * numProbes] - size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); // indexListSorted [maxBatchSize * numProbes] - size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); // numSamples [maxBatchSize,] - size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize); + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize); // cubWorkspace void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; @@ -2749,35 +2653,36 @@ inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescript d_values_in, d_values_out, desc->maxBatchSize * desc->numProbes); - desc->sizeCubWorkspace = _cuann_aligned(temp_storage_bytes); + desc->sizeCubWorkspace = Pow2<128>::roundUp(temp_storage_bytes); size += desc->sizeCubWorkspace; // chunkIndexPtr [maxBatchSize, numProbes] - size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); // topkSids [maxBatchSize, topk] - size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->topK); + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->topK); // similarity size_t unit_size = sizeof(float); if (desc->internalDistanceDtype == CUDA_R_16F) { unit_size = sizeof(half); } if (manage_local_topk(desc)) { // [matBatchSize, numProbes, topK] - size += _cuann_aligned(unit_size * desc->maxBatchSize * desc->numProbes * desc->topK); + size += Pow2<128>::roundUp(unit_size * desc->maxBatchSize * desc->numProbes * desc->topK); } else { // [matBatchSize, maxSamples] - size += _cuann_aligned(unit_size * desc->maxBatchSize * desc->maxSamples); + size += Pow2<128>::roundUp(unit_size * desc->maxBatchSize * desc->maxSamples); } // simTopkIndex if (manage_local_topk(desc)) { // [matBatchSize, numProbes, topk] - size += _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes * desc->topK); + size += + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes * desc->topK); } // topkScores if (manage_local_topk(desc)) { // [maxBatchSize, topk] - size += _cuann_aligned(sizeof(float) * desc->maxBatchSize * desc->topK); + size += Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK); } // preCompScores [multiProcessorCount, dimPq, 1 << bitPq,] size += - _cuann_aligned(sizeof(float) * getMultiProcessorCount() * desc->dimPq * (1 << desc->bitPq)); + Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * desc->dimPq * (1 << desc->bitPq)); // topkWorkspace if (manage_local_topk(desc)) { size += _cuann_find_topk_bufferSize(handle, @@ -3073,7 +2978,7 @@ inline void cuannPostprocessingMerge( inline size_t _cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t desc) { // 
[numClusters, dimDatasetExt] - return _cuann_aligned(sizeof(float) * desc->numClusters * desc->dimDatasetExt); + return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimDatasetExt); } inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) @@ -3081,41 +2986,41 @@ inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) size_t size_base = sizeof(float) * (1 << desc->bitPq) * desc->lenPq; if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { // [dimPq, 1 << bitPq, lenPq] - return _cuann_aligned(desc->dimPq * size_base); + return Pow2<128>::roundUp(desc->dimPq * size_base); } else { // [numClusters, 1 << bitPq, lenPq] - return _cuann_aligned(desc->numClusters * size_base); + return Pow2<128>::roundUp(desc->numClusters * size_base); } } inline size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t desc) { // [numDataset, dimPq * bitPq / 8] - return _cuann_aligned(sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8); + return Pow2<128>::roundUp(sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8); } inline size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t desc) { // [numDataset,] - return _cuann_aligned(sizeof(uint32_t) * desc->numDataset); + return Pow2<128>::roundUp(sizeof(uint32_t) * desc->numDataset); } inline size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t desc) { // [numClusters + 1,] - return _cuann_aligned(sizeof(uint32_t) * (desc->numClusters + 1)); + return Pow2<128>::roundUp(sizeof(uint32_t) * (desc->numClusters + 1)); } inline size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t desc) { // [dimDataset, dimRotDataset] - return _cuann_aligned(sizeof(float) * desc->dimDataset * desc->dimRotDataset); + return Pow2<128>::roundUp(sizeof(float) * desc->dimDataset * desc->dimRotDataset); } inline size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t desc) { // [numClusters, dimRotDataset] - return _cuann_aligned(sizeof(float) * desc->numClusters * desc->dimRotDataset); + return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimRotDataset); } inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t desc, @@ -3475,33 +3380,32 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, // resVectors[..] = newVectors[..] - clusterCenters[..] 
// if (dtype == CUDA_R_32F) { - _cuann_copy_with_list(clusterSize[l], - dimDataset, - (float*)dataset, - originalNumbers + indexPtr[l], - dimDataset, - resVectors[devId], - dimDataset); + detail::utils::copy_selected(clusterSize[l], + dimDataset, + (float*)dataset, + originalNumbers + indexPtr[l], + dimDataset, + resVectors[devId], + dimDataset, + handle.get_stream()); } else if (dtype == CUDA_R_8U) { - const float divisor = 256.0; - _cuann_copy_with_list(clusterSize[l], - dimDataset, - (uint8_t*)dataset, - originalNumbers + indexPtr[l], - dimDataset, - resVectors[devId], - dimDataset, - divisor); + detail::utils::copy_selected(clusterSize[l], + dimDataset, + (uint8_t*)dataset, + originalNumbers + indexPtr[l], + dimDataset, + resVectors[devId], + dimDataset, + handle.get_stream()); } else if (dtype == CUDA_R_8I) { - const float divisor = 128.0; - _cuann_copy_with_list(clusterSize[l], - dimDataset, - (int8_t*)dataset, - originalNumbers + indexPtr[l], - dimDataset, - resVectors[devId], - dimDataset, - divisor); + detail::utils::copy_selected(clusterSize[l], + dimDataset, + (int8_t*)dataset, + originalNumbers + indexPtr[l], + dimDataset, + resVectors[devId], + dimDataset, + handle.get_stream()); } _cuann_a_me_b(clusterSize[l], dimDataset, @@ -3975,33 +3879,32 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, assert(k == mesoClusterSize[i]); if (dtype == CUDA_R_32F) { - _cuann_copy_with_list(mesoClusterSize[i], - desc->dimDataset, - (const float*)trainset, - (const uint32_t*)(idsTrainset[devId]), - desc->dimDataset, - subTrainset[devId], - desc->dimDataset); + detail::utils::copy_selected(mesoClusterSize[i], + desc->dimDataset, + (const float*)trainset, + (const uint32_t*)(idsTrainset[devId]), + desc->dimDataset, + subTrainset[devId], + desc->dimDataset, + handle.get_stream()); } else if (dtype == CUDA_R_8U) { - float divisor = 256.0; - _cuann_copy_with_list(mesoClusterSize[i], - desc->dimDataset, - (const uint8_t*)trainset, - (const uint32_t*)(idsTrainset[devId]), - desc->dimDataset, - subTrainset[devId], - desc->dimDataset, - divisor); + detail::utils::copy_selected(mesoClusterSize[i], + desc->dimDataset, + (const uint8_t*)trainset, + (const uint32_t*)(idsTrainset[devId]), + desc->dimDataset, + subTrainset[devId], + desc->dimDataset, + handle.get_stream()); } else if (dtype == CUDA_R_8I) { - float divisor = 128.0; - _cuann_copy_with_list(mesoClusterSize[i], - desc->dimDataset, - (const int8_t*)trainset, - (const uint32_t*)(idsTrainset[devId]), - desc->dimDataset, - subTrainset[devId], - desc->dimDataset, - divisor); + detail::utils::copy_selected(mesoClusterSize[i], + desc->dimDataset, + (const int8_t*)trainset, + (const uint32_t*)(idsTrainset[devId]), + desc->dimDataset, + subTrainset[devId], + desc->dimDataset, + handle.get_stream()); } int numIterations_2 = numIterations * 2; for (int iter = 0; iter < numIterations_2; iter += 2) { @@ -4977,11 +4880,11 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, } size_t size_0 = - _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries - _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries - _cuann_aligned(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries - _cuann_aligned(sizeof(uint32_t) * maxQueries * desc->numProbes) + // clusterLabels.. 
- _cuann_aligned(sizeof(float) * maxQueries * desc->numClusters) + // QCDistances + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries + Pow2<128>::roundUp(sizeof(uint32_t) * maxQueries * desc->numProbes) + // clusterLabels.. + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->numClusters) + // QCDistances _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); if (size_0 > max_ws) { maxQueries = maxQueries * max_ws / size_0; @@ -4993,10 +4896,10 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, desc->maxQueries = maxQueries; *workspaceSize = - _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries - _cuann_aligned(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries - _cuann_aligned(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries - _cuann_aligned(sizeof(uint32_t) * maxQueries * desc->numProbes); // clusterLabels.. + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries + Pow2<128>::roundUp(sizeof(uint32_t) * maxQueries * desc->numProbes); // clusterLabels.. max_ws -= *workspaceSize; desc->maxBatchSize = 1; @@ -5027,7 +4930,7 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, } size_t size_1 = - _cuann_aligned(sizeof(float) * maxQueries * desc->numClusters) + // QCDistance + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->numClusters) + // QCDistance _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); size_t size_2 = ivfpq_search_bufferSize(handle, desc); *workspaceSize += max(size_1, size_2); @@ -5085,20 +4988,21 @@ inline void cuannIvfPqSearch( void* searchWorkspace; devQueries = (void*)workspace; curQueries = (float*)((uint8_t*)devQueries + - _cuann_aligned(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); rotQueries = (float*)((uint8_t*)curQueries + - _cuann_aligned(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); clusterLabelsToProbe = (uint32_t*)((uint8_t*)rotQueries + - _cuann_aligned(sizeof(float) * desc->maxQueries * desc->dimRotDataset)); + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimRotDataset)); // QCDistances = (float*)((uint8_t*)clusterLabelsToProbe + - _cuann_aligned(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); topkWorkspace = (void*)((uint8_t*)QCDistances + - _cuann_aligned(sizeof(float) * desc->maxQueries * desc->numClusters)); + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->numClusters)); // - searchWorkspace = (void*)((uint8_t*)clusterLabelsToProbe + - _cuann_aligned(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); + searchWorkspace = + (void*)((uint8_t*)clusterLabelsToProbe + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); void (*_ivfpq_search)(const handle_t&, cuannIvfPqDescriptor_t, @@ -6136,39 +6040,43 @@ inline void ivfpq_search(const handle_t& handle, void* topkWorkspace; 
clusterLabelsOut = (uint32_t*)workspace; - indexList = (uint32_t*)((uint8_t*)clusterLabelsOut + - _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + indexList = + (uint32_t*)((uint8_t*)clusterLabelsOut + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); indexListSorted = (uint32_t*)((uint8_t*)indexList + - _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); - numSamples = (uint32_t*)((uint8_t*)indexListSorted + - _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + numSamples = + (uint32_t*)((uint8_t*)indexListSorted + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); cubWorkspace = - (void*)((uint8_t*)numSamples + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize)); + (void*)((uint8_t*)numSamples + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize)); chunkIndexPtr = (uint32_t*)((uint8_t*)cubWorkspace + desc->sizeCubWorkspace); - topkSids = (uint32_t*)((uint8_t*)chunkIndexPtr + - _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); - similarity = (scoreDtype*)((uint8_t*)topkSids + - _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * desc->topK)); + topkSids = + (uint32_t*)((uint8_t*)chunkIndexPtr + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + similarity = + (scoreDtype*)((uint8_t*)topkSids + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->topK)); if (manage_local_topk(desc)) { topkScores = - (float*)((uint8_t*)similarity + _cuann_aligned(sizeof(scoreDtype) * desc->maxBatchSize * - desc->numProbes * desc->topK)); + (float*)((uint8_t*)similarity + Pow2<128>::roundUp(sizeof(scoreDtype) * desc->maxBatchSize * + desc->numProbes * desc->topK)); simTopkIndex = (uint32_t*)((uint8_t*)topkScores + - _cuann_aligned(sizeof(float) * desc->maxBatchSize * desc->topK)); + Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK)); preCompScores = - (float*)((uint8_t*)simTopkIndex + _cuann_aligned(sizeof(uint32_t) * desc->maxBatchSize * - desc->numProbes * desc->topK)); + (float*)((uint8_t*)simTopkIndex + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * + desc->numProbes * desc->topK)); } else { topkScores = NULL; simTopkIndex = NULL; preCompScores = (float*)((uint8_t*)similarity + - _cuann_aligned(sizeof(scoreDtype) * desc->maxBatchSize * desc->maxSamples)); + Pow2<128>::roundUp(sizeof(scoreDtype) * desc->maxBatchSize * desc->maxSamples)); } topkWorkspace = - (void*)((uint8_t*)preCompScores + _cuann_aligned(sizeof(float) * getMultiProcessorCount() * - desc->dimPq * (1 << desc->bitPq))); + (void*)((uint8_t*)preCompScores + Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * + desc->dimPq * (1 << desc->bitPq))); // if (manage_local_topk(desc)) { From 4d11178833ecaed35dcdfd2f01f0c4b54500a3c0 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 10 Aug 2022 15:53:26 +0200 Subject: [PATCH 012/140] Wrap remaining cuda calls into RAFT_CUDA_TRY and replace asserts with RAFT_EXPECTS --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 258 ++++++++++++------------ 1 file changed, 134 insertions(+), 124 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 4a7a924fbe..f3bb852817 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -292,8 +292,8 @@ inline void _cuann_copy_fill(uint32_t nRows, D 
divisor, cudaStream_t stream) { - assert(ldSrc >= nCols); - assert(ldDst >= nCols); + RAFT_EXPECTS(ldSrc >= nCols, "src leading dimension must be larger than nCols"); + RAFT_EXPECTS(ldDst >= nCols, "dist leading dimension must be larger than nCols"); uint32_t nThreads = 128; uint32_t nBlocks = ((nRows * ldDst) + nThreads - 1) / nThreads; kern_copy_fill @@ -523,7 +523,7 @@ template inline void _cuann_multi_device_free(T** arrays, int numDevices) { for (int devId = 0; devId < numDevices; devId++) { - cudaFree(arrays[devId]); + RAFT_CUDA_TRY(cudaFree(arrays[devId])); } free(arrays); } @@ -580,8 +580,8 @@ inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCen numDataset, dimCenters, centers, clusterSize, (const int8_t*)dataset, labels, stream); } } else { - cudaMemcpy( - centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault); + RAFT_CUDA_TRY(cudaMemcpy( + centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault)); } if (similarity == CUANN_SIMILARITY_INNER) { @@ -686,7 +686,7 @@ inline void _cuann_kmeans_predict(const handle_t& handle, cudaMemcpyKind kind; cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dataset); + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { kind = cudaMemcpyDeviceToDevice; } else { @@ -786,7 +786,7 @@ inline void _cuann_kmeans_predict(const handle_t& handle, tempCenters); } - if (_workspace == NULL) { cudaFree(workspace); } + if (_workspace == NULL) { RAFT_CUDA_TRY(cudaFree(workspace)); } } // @@ -826,15 +826,15 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, numDevices, sizePredictWorkspace, "predictWorkspaceMP"); int orgDevId; - cudaGetDevice(&orgDevId); + RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); #pragma omp parallel num_threads(numDevices) { int devId = omp_get_thread_num(); - cudaSetDevice(devId); - cudaMemcpy(clusterCentersCopy[devId], - clusterCenters, - sizeof(float) * numCenters * dimCenters, - cudaMemcpyDefault); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + RAFT_CUDA_TRY(cudaMemcpy(clusterCentersCopy[devId], + clusterCenters, + sizeof(float) * numCenters * dimCenters, + cudaMemcpyDefault)); uint64_t d0 = (uint64_t)numDataset * (devId) / numDevices; uint64_t d1 = (uint64_t)numDataset * (devId + 1) / numDevices; uint64_t nDataset = d1 - d0; @@ -863,10 +863,10 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, } for (int devId = 0; devId < numDevices; devId++) { // Barrier - cudaSetDevice(devId); - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); } - cudaSetDevice(orgDevId); + RAFT_CUDA_TRY(cudaSetDevice(orgDevId)); auto stream = handle.get_stream(); if (clusterSize != NULL) { // Reduce results to main thread @@ -1033,14 +1033,14 @@ bool _cuann_kmeans_adjust_centers(float* centers, // [numCenters, dimCenters] ofst = primes[iPrimes]; } while (numDataset % ofst == 0); - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dataset); + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { // GPU uint32_t* count; if (ws == NULL) { - cudaMallocManaged(&count, sizeof(uint32_t)); + RAFT_CUDA_TRY(cudaMallocManaged(&count, sizeof(uint32_t))); } else { count = (uint32_t*)ws; } @@ -1078,9 +1078,9 @@ bool _cuann_kmeans_adjust_centers(float* centers, // 
[numCenters, dimCenters] average, ofst, count); - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); if (count[0] > 0) { adjusted = true; } - if (ws == NULL) { cudaFree(count); } + if (ws == NULL) { RAFT_CUDA_TRY(cudaFree(count)); } } else { // CPU uint32_t i = 0; @@ -2092,7 +2092,7 @@ inline size_t _cuann_find_topk_bufferSize(const handle_t& handle, { constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; - assert(stateBitLen == 0 || stateBitLen == 8); + static_assert(stateBitLen == 0 || stateBitLen == 8); size_t workspaceSize = 0; // count @@ -2161,9 +2161,10 @@ inline void _cuann_find_topk(const handle_t& handle, { constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; - assert(stateBitLen == 0 || stateBitLen == 8); + static_assert(stateBitLen == 0 || stateBitLen == 8); #ifdef CUANN_DEBUG - cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream()); + RAFT_CUDA_TRY( + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); #endif // Limit the maximum value of vecLen to 4. In the case of FP32, @@ -2180,8 +2181,8 @@ inline void _cuann_find_topk(const handle_t& handle, int numBlocksPerSm_topk; size_t dynamicSMemSize = 0; - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm_topk, cg_kernel, numThreads, dynamicSMemSize); + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm_topk, cg_kernel, numThreads, dynamicSMemSize)); int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); int numBlocks = min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); @@ -2227,7 +2228,8 @@ inline void _cuann_find_topk(const handle_t& handle, args[6] = {&(labels)}; args[7] = {&(count)}; args[8] = {nullptr}; - cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream()); + RAFT_CUDA_TRY( + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream())); } if (!sort) { return; } @@ -2273,11 +2275,11 @@ inline void _cuann_find_topk(const handle_t& handle, (int)(sizeof(float) * 8), handle.get_stream()); - cudaMemcpyAsync(labels, - values_out, - sizeof(uint32_t) * sizeBatch * topK, - cudaMemcpyDeviceToDevice, - handle.get_stream()); + RAFT_CUDA_TRY(cudaMemcpyAsync(labels, + values_out, + sizeof(uint32_t) * sizeBatch * topK, + cudaMemcpyDeviceToDevice, + handle.get_stream())); } // @@ -2293,9 +2295,10 @@ inline void _cuann_find_topk(const handle_t& handle, { constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; - assert(stateBitLen == 0 || stateBitLen == 8); + static_assert(stateBitLen == 0 || stateBitLen == 8); #ifdef CUANN_DEBUG - cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream()); + RAFT_CUDA_TRY( + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); #endif int vecLen = _get_vecLen(maxSamples); @@ -2311,7 +2314,8 @@ inline void _cuann_find_topk(const handle_t& handle, } int numBlocksPerSm_topk; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm_topk, cg_kernel, numThreads, 0); + RAFT_CUDA_TRY( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm_topk, cg_kernel, numThreads, 0)); int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); int numBlocks = min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); @@ -2357,7 +2361,8 
@@ inline void _cuann_find_topk(const handle_t& handle, args[6] = {&(labels)}; args[7] = {&(count)}; args[8] = {nullptr}; - cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream()); + RAFT_CUDA_TRY( + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream())); } } @@ -2809,7 +2814,7 @@ inline void ivfpq_encode(uint32_t numDataset, numDataset, ldDataset, dimPq, bitPq, label, output); #else // CPU - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); for (uint32_t i = 0; i < numDataset; i++) { ivfpq_encode_core(ldDataset, dimPq, bitPq, label + i, output + (dimPq * bitPq / 8) * i); } @@ -3106,7 +3111,7 @@ inline void _cuann_get_inclusiveSumSortedClusterSize( for (uint32_t i = 1; i < desc->numClusters; i++) { (*output)[i] += (*output)[i - 1]; } - assert((*output)[desc->numClusters - 1] == desc->numDataset); + RAFT_EXPECTS((*output)[desc->numClusters - 1] == desc->numDataset, "cluster sizes do not add up"); } inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, @@ -3114,7 +3119,7 @@ inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, float** output // [numClusters,] ) { - if (*output != NULL) { cudaFree(*output); } + if (*output != NULL) { RAFT_CUDA_TRY(cudaFree(*output)); } RAFT_CUDA_TRY(cudaMallocManaged(output, sizeof(float) * desc->numClusters)); switch (detail::utils::check_pointer_residency(clusterCenters, *output)) { case detail::utils::pointer_residency::device_only: @@ -3164,8 +3169,10 @@ inline void _cuann_make_rotation_matrix(uint32_t nRows, float* rotationMatrix // [nRows, nCols] ) { - assert(nRows >= nCols); - assert(nRows % lenPq == 0); + RAFT_EXPECTS( + nRows >= nCols, "number of rows (%u) must be larger than number or cols (%u)", nRows, nCols); + RAFT_EXPECTS( + nRows % lenPq == 0, "number of rows (%u) must be a multiple of lenPq (%u)", nRows, lenPq); if (randomRotation) { RAFT_LOG_DEBUG("Creating a random rotation matrix."); @@ -3368,7 +3375,7 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, #pragma omp parallel for schedule(dynamic) num_threads(1) for (uint32_t l = 0; l < numClusters; l++) { int devId = omp_get_thread_num(); - cudaSetDevice(devId); + RAFT_CUDA_TRY(cudaSetDevice(devId)); if (devId == 0) { fprintf(stderr, "(%s) Making PQ dataset: %u / %u \r", __func__, l, numClusters); } @@ -3481,10 +3488,10 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, iter -= 1; } } - cudaMemcpy(pqCenters + ((1 << bitPq) * lenPq) * l, - myPqCenters[devId], - sizeof(float) * (1 << bitPq) * lenPq, - cudaMemcpyDeviceToHost); + RAFT_CUDA_TRY(cudaMemcpy(pqCenters + ((1 << bitPq) * lenPq) * l, + myPqCenters[devId], + sizeof(float) * (1 << bitPq) * lenPq, + cudaMemcpyDeviceToHost)); } // @@ -3536,13 +3543,12 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, // ivfpq_encode( clusterSize[l], clusterSize[l], dimPq, bitPq, subVectorLabels[devId], myPqDataset[devId]); - cudaMemcpy(pqDataset + ((uint64_t)indexPtr[l] * dimPq * bitPq / 8), - myPqDataset[devId], - sizeof(uint8_t) * clusterSize[l] * dimPq * bitPq / 8, - cudaMemcpyDeviceToHost); - // cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaMemcpy(pqDataset + ((uint64_t)indexPtr[l] * dimPq * bitPq / 8), + myPqDataset[devId], + sizeof(uint8_t) * clusterSize[l] * dimPq * bitPq / 8, + cudaMemcpyDeviceToHost)); } - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); fprintf(stderr, "\n"); // @@ -3586,7 +3592,7 @@ inline void cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) inline void 
cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) { RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - if (desc->sqsumClusters != NULL) { cudaFree(desc->sqsumClusters); } + if (desc->sqsumClusters != NULL) { RAFT_CUDA_TRY(cudaFree(desc->sqsumClusters)); } free(desc); } @@ -3625,8 +3631,8 @@ inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, desc->dimDataset = dimDataset; desc->dimDatasetExt = dimDataset + 1; if (desc->dimDatasetExt % 8) { desc->dimDatasetExt += 8 - (desc->dimDatasetExt % 8); } - assert(desc->dimDatasetExt >= dimDataset + 1); - assert(desc->dimDatasetExt % 8 == 0); + RAFT_EXPECTS(desc->dimDatasetExt >= dimDataset + 1, "unexpected dimDatasetExt"); + RAFT_EXPECTS(desc->dimDatasetExt % 8 == 0, "unexpected dimDatasetExt"); desc->dimPq = dimPq; desc->bitPq = bitPq; desc->similarity = similarity; @@ -3799,7 +3805,7 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, iter -= 1; } } - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // Number of centers in each meso cluster // [numMesoClusters,] @@ -3829,8 +3835,9 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, mesoClusterSizeMax = max(mesoClusterSizeMax, mesoClusterSize[i]); numFineClustersMax = max(numFineClustersMax, numFineClusters[i]); } - assert(mesoClusterSizeSum == numTrainset); - assert(csumFineClusters[numMesoClusters] == desc->numClusters); + RAFT_EXPECTS(mesoClusterSizeSum == numTrainset, "mesocluster sizes do not add up"); + RAFT_EXPECTS(csumFineClusters[numMesoClusters] == desc->numClusters, + "fine cluster sizes do not add up"); uint32_t** idsTrainset = _cuann_multi_device_malloc(1, mesoClusterSizeMax, "idsTrainset"); @@ -3869,14 +3876,14 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, #pragma omp parallel for schedule(dynamic) num_threads(1) for (uint32_t i = 0; i < numMesoClusters; i++) { int devId = omp_get_thread_num(); - cudaSetDevice(devId); + RAFT_CUDA_TRY(cudaSetDevice(devId)); uint32_t k = 0; for (uint32_t j = 0; j < numTrainset; j++) { if (mesoClusterLabels[j] != i) continue; idsTrainset[devId][k++] = j; } - assert(k == mesoClusterSize[i]); + RAFT_EXPECTS(k == mesoClusterSize[i], "unexpected cluster size for cluster %u", i); if (dtype == CUDA_R_32F) { detail::utils::copy_selected(mesoClusterSize[i], @@ -3936,16 +3943,16 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, iter -= 1; } } - cudaMemcpy(clusterCenters + (desc->dimDataset * csumFineClusters[i]), - clusterCentersEach[devId], - sizeof(float) * numFineClusters[i] * desc->dimDataset, - cudaMemcpyDeviceToDevice); + RAFT_CUDA_TRY(cudaMemcpy(clusterCenters + (desc->dimDataset * csumFineClusters[i]), + clusterCentersEach[devId], + sizeof(float) * numFineClusters[i] * desc->dimDataset, + cudaMemcpyDeviceToDevice)); } for (int devId = 0; devId < 1; devId++) { - cudaSetDevice(devId); - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); } - cudaSetDevice(cuannDevId); + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); _cuann_multi_device_free(idsTrainset, 1); _cuann_multi_device_free(subTrainset, 1); @@ -4055,7 +4062,7 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, true /* to update clusterCenters */); #ifdef CUANN_DEBUG - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); _cuann_kmeans_show_centers(clusterCenters, desc->numClusters, desc->dimDataset, clusterSize); #endif @@ -4206,13 +4213,13 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, #pragma omp parallel for 
schedule(dynamic) num_threads(1) for (uint32_t j = 0; j < desc->dimPq; j++) { int devId = omp_get_thread_num(); - cudaSetDevice(devId); + RAFT_CUDA_TRY(cudaSetDevice(devId)); float* curPqCenters = pqCenters + ((1 << desc->bitPq) * desc->lenPq) * j; - cudaMemcpy(subTrainset[devId], - modTrainset + ((uint64_t)numTrainset * desc->lenPq * j), - sizeof(float) * numTrainset * desc->lenPq, - cudaMemcpyHostToDevice); + RAFT_CUDA_TRY(cudaMemcpy(subTrainset[devId], + modTrainset + ((uint64_t)numTrainset * desc->lenPq * j), + sizeof(float) * numTrainset * desc->lenPq, + cudaMemcpyHostToDevice)); // Train kmeans for each PQ int numIterations_2 = numIterations * 2; for (int iter = 0; iter < numIterations_2; iter += 2) { @@ -4254,20 +4261,20 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, iter -= 1; } } - cudaMemcpy(curPqCenters, - pqCentersEach[devId], - sizeof(float) * ((1 << desc->bitPq) * desc->lenPq), - cudaMemcpyDeviceToDevice); + RAFT_CUDA_TRY(cudaMemcpy(curPqCenters, + pqCentersEach[devId], + sizeof(float) * ((1 << desc->bitPq) * desc->lenPq), + cudaMemcpyDeviceToDevice)); #ifdef CUANN_DEBUG if (j == 0) { - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); _cuann_kmeans_show_centers( curPqCenters, (1 << desc->bitPq), desc->lenPq, pqClusterSize[devId]); } #endif } fprintf(stderr, "\n"); - cudaSetDevice(cuannDevId); + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); _cuann_multi_device_free(subTrainset, 1); _cuann_multi_device_free(subTrainsetLabels, 1); @@ -4298,7 +4305,7 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, pqCenters, numIterations, pqDataset); - cudaSetDevice(cuannDevId); + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); // _cuann_get_inclusiveSumSortedClusterSize( @@ -4307,7 +4314,7 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, { // combine clusterCenters and sqsumClusters - cudaDeviceSynchronize(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); float* tmpClusterCenters; // [numClusters, dimDataset] RAFT_CUDA_TRY( cudaMallocManaged(&tmpClusterCenters, sizeof(float) * desc->numClusters * desc->dimDataset)); @@ -4321,7 +4328,7 @@ inline void cuannIvfPqBuildIndex(const handle_t& handle, } clusterCenters[desc->dimDataset + (desc->dimDatasetExt * i)] = desc->sqsumClusters[i]; } - cudaFree(tmpClusterCenters); + RAFT_CUDA_TRY(cudaFree(tmpClusterCenters)); } // @@ -4398,7 +4405,7 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, } fclose(fp); - cudaMemAdvise(index, indexSize, cudaMemAdviseSetReadMostly, handle.get_device()); + RAFT_CUDA_TRY(cudaMemAdvise(index, indexSize, cudaMemAdviseSetReadMostly, handle.get_device())); } struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(*index); @@ -4443,33 +4450,33 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, // pqDataset size = sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8; if (size < handle.get_device_properties().totalGlobalMem) { - cudaMemPrefetchAsync(pqDataset, size, handle.get_device()); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqDataset, size, handle.get_device())); } // clusterCenters size = sizeof(float) * desc->numClusters * desc->dimDatasetExt; - cudaMemPrefetchAsync(clusterCenters, size, handle.get_device()); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterCenters, size, handle.get_device())); // pqCenters if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { size = sizeof(float) * desc->dimPq * (1 << desc->bitPq) * desc->lenPq; } else { size = sizeof(float) * desc->numClusters * (1 << desc->bitPq) * desc->lenPq; } - 
cudaMemPrefetchAsync(pqCenters, size, handle.get_device()); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqCenters, size, handle.get_device())); // originalNumbers size = sizeof(uint32_t) * desc->numDataset; - cudaMemPrefetchAsync(originalNumbers, size, handle.get_device()); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(originalNumbers, size, handle.get_device())); // indexPtr size = sizeof(uint32_t) * (desc->numClusters + 1); - cudaMemPrefetchAsync(indexPtr, size, handle.get_device()); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(indexPtr, size, handle.get_device())); // rotationMatrix if (rotationMatrix != NULL) { size = sizeof(float) * desc->dimDataset * desc->dimRotDataset; - cudaMemPrefetchAsync(rotationMatrix, size, handle.get_device()); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(rotationMatrix, size, handle.get_device())); } // clusterRotCenters if (clusterRotCenters != NULL) { size = sizeof(float) * desc->numClusters * desc->dimRotDataset; - cudaMemPrefetchAsync(clusterRotCenters, size, handle.get_device()); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterRotCenters, size, handle.get_device())); } _cuann_set_device(orgDevId); @@ -4543,10 +4550,10 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // uint32_t* newVectorLabels; // [numNewVectors,] RAFT_CUDA_TRY(cudaMallocManaged(&newVectorLabels, sizeof(uint32_t) * numNewVectors)); - cudaMemset(newVectorLabels, 0, sizeof(uint32_t) * numNewVectors); + RAFT_CUDA_TRY(cudaMemset(newVectorLabels, 0, sizeof(uint32_t) * numNewVectors)); uint32_t* clusterSize; // [numClusters,] RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * oldDesc->numClusters)); - cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters); + RAFT_CUDA_TRY(cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters)); _cuann_kmeans_predict_MP(handle, clusterCenters, oldDesc->numClusters, @@ -4644,7 +4651,7 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( oldPqCenters, 0, pqDataset); - cudaSetDevice(cuannDevId); + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); // // Create descriptor for new index @@ -5042,7 +5049,7 @@ inline void cuannIvfPqSearch( } cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, queries); + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, queries)); for (uint32_t i = 0; i < numQueries; i += desc->maxQueries) { uint32_t nQueries = min(desc->maxQueries, numQueries - i); @@ -5058,11 +5065,11 @@ inline void cuannIvfPqSearch( if (dtype == CUDA_R_32F) { float* ptrQueries = (float*)queries + ((uint64_t)(desc->dimDataset) * i); if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - cudaMemcpyAsync(devQueries, - ptrQueries, - sizeof(float) * nQueries * desc->dimDataset, - cudaMemcpyHostToDevice, - handle.get_stream()); + RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(float) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle.get_stream())); ptrQueries = (float*)devQueries; } _cuann_copy_fill(nQueries, @@ -5077,11 +5084,11 @@ inline void cuannIvfPqSearch( } else if (dtype == CUDA_R_8U) { uint8_t* ptrQueries = (uint8_t*)queries + ((uint64_t)(desc->dimDataset) * i); if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - cudaMemcpyAsync(devQueries, - ptrQueries, - sizeof(uint8_t) * nQueries * desc->dimDataset, - cudaMemcpyHostToDevice, - handle.get_stream()); + RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(uint8_t) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle.get_stream())); ptrQueries = (uint8_t*)devQueries; } 
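The hunks above and below all follow the single convention this patch introduces: raw CUDA runtime calls are wrapped in RAFT_CUDA_TRY and plain asserts become RAFT_EXPECTS. A minimal, self-contained sketch of that convention follows; the include paths are assumptions and may differ between RAFT versions, and copy_batch_sketch is a hypothetical helper, not part of the patch.

#include <raft/core/error.hpp>          // RAFT_EXPECTS (assumed path)
#include <raft/util/cudart_utils.hpp>   // RAFT_CUDA_TRY (assumed path)

inline void copy_batch_sketch(float* dst, const float* src, size_t n_elems, cudaStream_t stream)
{
  // RAFT_EXPECTS checks an invariant and throws raft::logic_error with the
  // formatted message instead of aborting the process like assert() would.
  RAFT_EXPECTS(dst != nullptr && src != nullptr, "buffers must not be null");
  // RAFT_CUDA_TRY evaluates the runtime call and turns any non-zero
  // cudaError_t into a thrown raft::cuda_error with file/line context.
  RAFT_CUDA_TRY(
    cudaMemcpyAsync(dst, src, sizeof(float) * n_elems, cudaMemcpyDefault, stream));
}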
_cuann_copy_fill(nQueries, @@ -5096,11 +5103,11 @@ inline void cuannIvfPqSearch( } else if (dtype == CUDA_R_8I) { int8_t* ptrQueries = (int8_t*)queries + ((uint64_t)(desc->dimDataset) * i); if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - cudaMemcpyAsync(devQueries, - ptrQueries, - sizeof(int8_t) * nQueries * desc->dimDataset, - cudaMemcpyHostToDevice, - handle.get_stream()); + RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(int8_t) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle.get_stream())); ptrQueries = (int8_t*)devQueries; } _cuann_copy_fill(nQueries, @@ -5124,7 +5131,7 @@ inline void cuannIvfPqSearch( alpha = -2.0; beta = 0.0; gemmK = desc->dimDataset + 1; - assert(gemmK <= desc->dimDatasetExt); + RAFT_EXPECTS(gemmK <= desc->dimDatasetExt, "unexpected gemmK or dimDatasetExt"); } linalg::gemm(handle, true, @@ -6023,7 +6030,10 @@ inline void ivfpq_search(const handle_t& handle, float* topkDistances, // [numQueries, topK] void* workspace) { - assert(numQueries <= desc->maxBatchSize); + RAFT_EXPECTS(numQueries <= desc->maxBatchSize, + "number of queries (%u) must be smaller the max batch size (%u)", + numQueries, + desc->maxBatchSize); uint32_t* clusterLabelsOut; // [maxBatchSize, numProbes] uint32_t* indexList; // [maxBatchSize * numProbes] @@ -6133,24 +6143,24 @@ inline void ivfpq_search(const handle_t& handle, // Select a GPU kernel for distance calculation #define SET_KERNEL1(B, V, T, D) \ do { \ - assert((B * V) % (sizeof(T) * 8) == 0); \ + static_assert((B * V) % (sizeof(T) * 8) == 0); \ kernel_no_basediff = ivfpq_compute_similarity; \ kernel_fast = ivfpq_compute_similarity; \ kernel_no_smem_lut = ivfpq_compute_similarity_no_smem_lut; \ } while (0) -#define SET_KERNEL2(B, M, D) \ - do { \ - assert(desc->dimPq % M == 0); \ - if (desc->dimPq % (M * 8) == 0) { \ - SET_KERNEL1(B, (M * 8), uint64_t, D); \ - } else if (desc->dimPq % (M * 4) == 0) { \ - SET_KERNEL1(B, (M * 4), uint32_t, D); \ - } else if (desc->dimPq % (M * 2) == 0) { \ - SET_KERNEL1(B, (M * 2), uint16_t, D); \ - } else if (desc->dimPq % (M * 1) == 0) { \ - SET_KERNEL1(B, (M * 1), uint8_t, D); \ - } \ +#define SET_KERNEL2(B, M, D) \ + do { \ + RAFT_EXPECTS(desc->dimPq % M == 0, "dimPq must be a multiple of %u", M); \ + if (desc->dimPq % (M * 8) == 0) { \ + SET_KERNEL1(B, (M * 8), uint64_t, D); \ + } else if (desc->dimPq % (M * 4) == 0) { \ + SET_KERNEL1(B, (M * 4), uint32_t, D); \ + } else if (desc->dimPq % (M * 2) == 0) { \ + SET_KERNEL1(B, (M * 2), uint16_t, D); \ + } else if (desc->dimPq % (M * 1) == 0) { \ + SET_KERNEL1(B, (M * 1), uint8_t, D); \ + } \ } while (0) #define SET_KERNEL3(D) \ From 267b35eb113e24e412c645afaea627b3f7c72d60 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 11 Aug 2022 11:30:26 +0200 Subject: [PATCH 013/140] WIP: templatizing the api --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 92 ++++++++++++------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index f3bb852817..0bedbede24 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -2890,18 +2890,6 @@ inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size /* bytes of dataset index */); -inline void cuannIvfPqBuildIndex( - const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* dataset, /* [numDataset, dimDataset] */ - const 
void* trainset, /* [numTrainset, dimDataset] */ - cudaDataType_t dtype, - uint32_t numTrainset, /* Number of train-set entries */ - uint32_t numIterations, /* Number of iterations to train kmeans */ - bool randomRotation, /* If true, rotate vectors with randamly created rotation matrix */ - bool hierarchicalClustering, /* If true, do kmeans training hierarchically */ - void* index /* database index to build */); - inline void cuannIvfPqSaveIndex(const handle_t& handle, cuannIvfPqDescriptor_t desc, const void* index, @@ -2945,16 +2933,6 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, size_t maxWorkspaceSize, size_t* workspaceSize); -inline void cuannIvfPqSearch(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const void* queries, /* [numQueries, dimDataset] */ - cudaDataType_t dtype, - uint32_t numQueries, - uint64_t* neighbors, /* [numQueries, topK] */ - float* distances, /* [numQueries, topK] */ - void* workspace); - inline void cuannPostprocessingRefine(uint32_t numDataset, uint32_t numQueries, uint32_t dimDataset, @@ -3680,23 +3658,33 @@ inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) *size += _cuann_getIndexSize_clusterRotCenters(desc); } -// cuannIvfPqBuildIndex -inline void cuannIvfPqBuildIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* dataset, - const void* trainset, - cudaDataType_t dtype, - uint32_t numTrainset, - uint32_t numIterations, - bool randomRotation, - bool hierarchicalClustering, - void* index) +template +void cuannIvfPqBuildIndex( + const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const T* dataset, /* [numDataset, dimDataset] */ + const T* trainset, /* [numTrainset, dimDataset] */ + uint32_t numTrainset, /* Number of train-set entries */ + uint32_t numIterations, /* Number of iterations to train kmeans */ + bool randomRotation, /* If true, rotate vectors with randamly created rotation matrix */ + bool hierarchicalClustering, /* If true, do kmeans training hierarchically */ + void* index /* database index to build */) { int cuannDevId = handle.get_device(); int callerDevId = _cuann_set_device(cuannDevId); - RAFT_EXPECTS(dtype == CUDA_R_32F || dtype == CUDA_R_8U || dtype == CUDA_R_8I, - "Unsupported dtype"); + cudaDataType_t dtype; + if constexpr (std::is_same_v) { + dtype = CUDA_R_32F; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8U; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8I; + } else { + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v, + "unsupported type"); + } if (desc->similarity == CUANN_SIMILARITY_INNER) { RAFT_EXPECTS(dtype == CUDA_R_32F, "Unsupported dtype (inner-product metric support float only)"); @@ -4948,23 +4936,31 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, "workspaceSize: %lu (%.3f GiB)", *workspaceSize, (float)*workspaceSize / 1024 / 1024 / 1024); } -// cuannIvfPqSearch -inline void cuannIvfPqSearch( - const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const void* queries, // [numQueries, dimDataset], host or device pointer - cudaDataType_t dtype, - uint32_t numQueries, - uint64_t* neighbors, // [numQueries, topK], device pointer - float* distances, // [numQueries, topK], device pointer - void* workspace) +template +void cuannIvfPqSearch(const handle_t& handle, + cuannIvfPqDescriptor_t desc, + const void* index, + const T* queries, /* [numQueries, dimDataset], host or device pointer */ + uint32_t numQueries, + uint64_t* neighbors, /* 
[numQueries, topK], device pointer */ + float* distances, /* [numQueries, topK], device pointer */ + void* workspace) { RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); int orgDevId = _cuann_set_device(handle.get_device()); - RAFT_EXPECTS(dtype == CUDA_R_32F || dtype == CUDA_R_8U || dtype == CUDA_R_8I, - "unsupported dtype"); + cudaDataType_t dtype; + if constexpr (std::is_same_v) { + dtype = CUDA_R_32F; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8U; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8I; + } else { + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v, + "unsupported type"); + } struct cuannIvfPqIndexHeader* header; float* clusterCenters; // [numClusters, dimDatasetExt] From 2541250d2aecb05c7638d9ba685b1a97ce890534 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 11 Aug 2022 12:38:10 +0200 Subject: [PATCH 014/140] Add more templates and use adjust_centers --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 473 +++++++++--------------- cpp/test/spatial/ann_ivf_pq.cu | 15 +- 2 files changed, 171 insertions(+), 317 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 0bedbede24..4621a88663 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -1003,127 +1003,6 @@ __global__ void kern_adjust_centers(float* centers, // [numCenters, dimCenters] } } -// adjust centers which have small number of entries -bool _cuann_kmeans_adjust_centers(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - const uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, - const uint32_t* clusterSize, // [numCenters] - float threshold, - void* ws) -{ - RAFT_EXPECTS(dtype == CUDA_R_32F || dtype == CUDA_R_8U || dtype == CUDA_R_8I, - "Unsupported dtype (%d)", - dtype); - bool adjusted = false; - static uint32_t iPrimes = 0; - constexpr uint32_t numPrimes = 40; - uint32_t primes[numPrimes] = {29, 71, 113, 173, 229, 281, 349, 409, 463, 541, - 601, 659, 733, 809, 863, 941, 1013, 1069, 1151, 1223, - 1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987, - 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741}; - uint32_t average = (numDataset + numCenters - 1) / numCenters; - uint32_t ofst; - do { - iPrimes = (iPrimes + 1) % numPrimes; - ofst = primes[iPrimes]; - } while (numDataset % ofst == 0); - - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - cudaPointerAttributes attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); - if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { - // GPU - uint32_t* count; - if (ws == NULL) { - RAFT_CUDA_TRY(cudaMallocManaged(&count, sizeof(uint32_t))); - } else { - count = (uint32_t*)ws; - } - count[0] = 0; - void (*kernel)(float*, - uint32_t, - uint32_t, - const void*, - uint32_t, - const uint32_t*, - cuannSimilarity_t, - const uint32_t*, - float, - uint32_t, - uint32_t, - uint32_t*); - if (dtype == CUDA_R_32F) { - kernel = kern_adjust_centers; - } else if (dtype == CUDA_R_8U) { - kernel = kern_adjust_centers; - } else if (dtype == CUDA_R_8I) { - kernel = kern_adjust_centers; - } - dim3 threads(32, 4, 1); - dim3 blocks(1, (numCenters + threads.y - 1) / threads.y, 1); - kernel<<>>(centers, - numCenters, - dimCenters, - dataset, - numDataset, - labels, - similarity, - clusterSize, - threshold, - average, - ofst, - count); - 
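For orientation while reading this removal: the hand-rolled _cuann_kmeans_adjust_centers deleted here is superseded by the shared detail::kmeans::adjust_centers helper called in the '+' lines further down in this patch. A condensed sketch of the resulting loop shape, with variable names abridged from those '+' lines (the enclosing loop and the elided predict step are illustrative, not library code):

int n_iters_2 = 2 * n_iters;
for (int iter = 0; iter < n_iters_2; iter += 2) {
  // ... one k-means predict/update step over (dataset, labels, centers) ...
  if ((iter + 1 < n_iters_2) &&
      detail::kmeans::adjust_centers(centers, n_clusters, dim, dataset, n_rows,
                                     labels, cluster_sizes, 0.25f,
                                     device_memory, handle.get_stream())) {
    iter -= 1;  // an under-populated center was nudged towards a data point; repeat this step
  }
}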
RAFT_CUDA_TRY(cudaDeviceSynchronize()); - if (count[0] > 0) { adjusted = true; } - if (ws == NULL) { RAFT_CUDA_TRY(cudaFree(count)); } - } else { - // CPU - uint32_t i = 0; - uint32_t count = 0; - for (uint32_t l = 0; l < numCenters; l++) { - if (clusterSize[l] > (uint32_t)(average * threshold)) continue; - do { - i = (i + ofst) % numDataset; - } while (clusterSize[labels[i]] < average); - uint32_t li = labels[i]; - float sqsum = 0.0; - for (uint32_t j = 0; j < dimCenters; j++) { - float val = centers[j + ((uint64_t)dimCenters * li)] * (R_FACTOR - 1); - if (dtype == CUDA_R_32F) { - val += ((float*)dataset)[j + ((uint64_t)dimCenters * i)]; - } else if (dtype == CUDA_R_8U) { - float divisor = 256.0; - val += ((uint8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor; - } else if (dtype == CUDA_R_8I) { - float divisor = 128.0; - val += ((int8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor; - } - val /= R_FACTOR; - sqsum += val * val; - centers[j + ((uint64_t)dimCenters * l)] = val; - } - if (similarity == CUANN_SIMILARITY_INNER) { - sqsum = sqrt(sqsum); - for (uint32_t j = 0; j < dimCenters; j++) { - centers[j + ((uint64_t)dimCenters * l)] /= sqsum; - } - } - count += 1; - } - if (count > 0) { - adjusted = true; - RAFT_LOG_DEBUG( - "num adjusted: %u / %u, threshold: %d \n", count, numCenters, (int)(average * threshold)); - } - } - return adjusted; -} - /** * end of kmeans * @@ -3287,28 +3166,34 @@ uint32_t _get_num_trainset(uint32_t clusterSize, uint32_t dimPq, uint32_t bitPq) } // -inline void _cuann_compute_PQ_code(const handle_t& handle, - uint32_t numDataset, - uint32_t dimDataset, - uint32_t dimRotDataset, - uint32_t dimPq, - uint32_t lenPq, - uint32_t bitPq, - uint32_t numClusters, - cudaDataType_t dtype, - cuannPqCenter_t typePqCenter, - uint32_t maxClusterSize, - float* clusterCenters, // [numClusters, dimDataset] - const float* rotationMatrix, // [dimRotDataset, dimDataset] - const void* dataset, // [numDataset] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterSize, // [numClusters] - const uint32_t* indexPtr, // [numClusters + 1] - float* pqCenters, // [...] - uint32_t numIterations, - uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] +template +void _cuann_compute_PQ_code(const handle_t& handle, + uint32_t numDataset, + uint32_t dimDataset, + uint32_t dimRotDataset, + uint32_t dimPq, + uint32_t lenPq, + uint32_t bitPq, + uint32_t numClusters, + cuannPqCenter_t typePqCenter, + uint32_t maxClusterSize, + float* clusterCenters, // [numClusters, dimDataset] + const float* rotationMatrix, // [dimRotDataset, dimDataset] + const T* dataset, // [numDataset] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterSize, // [numClusters] + const uint32_t* indexPtr, // [numClusters + 1] + float* pqCenters, // [...] + uint32_t numIterations, + uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] ) { + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); + if (pool_guard) { + RAFT_LOG_DEBUG("_cuann_compute_PQ_code: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } // // Compute PQ code // @@ -3364,34 +3249,14 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, // centroids. // resVectors[..] = newVectors[..] - clusterCenters[..] 
// - if (dtype == CUDA_R_32F) { - detail::utils::copy_selected(clusterSize[l], - dimDataset, - (float*)dataset, - originalNumbers + indexPtr[l], - dimDataset, - resVectors[devId], - dimDataset, - handle.get_stream()); - } else if (dtype == CUDA_R_8U) { - detail::utils::copy_selected(clusterSize[l], - dimDataset, - (uint8_t*)dataset, - originalNumbers + indexPtr[l], - dimDataset, - resVectors[devId], - dimDataset, - handle.get_stream()); - } else if (dtype == CUDA_R_8I) { - detail::utils::copy_selected(clusterSize[l], - dimDataset, - (int8_t*)dataset, - originalNumbers + indexPtr[l], - dimDataset, - resVectors[devId], - dimDataset, - handle.get_stream()); - } + detail::utils::copy_selected(clusterSize[l], + dimDataset, + dataset, + originalNumbers + indexPtr[l], + dimDataset, + resVectors[devId], + dimDataset, + handle.get_stream()); _cuann_a_me_b(clusterSize[l], dimDataset, resVectors[devId], @@ -3452,17 +3317,16 @@ inline void _cuann_compute_PQ_code(const handle_t& handle, myPqCentersTemp[devId], pqClusterSize[devId], true); - if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(myPqCenters[devId], - (1 << bitPq), - lenPq, - rotVectors[devId], - CUDA_R_32F, - numTrainset, - rotVectorLabels[devId], - CUANN_SIMILARITY_L2, - pqClusterSize[devId], - (float)1.0 / 4, - wsKAC[devId])) { + if ((iter + 1 < numIterations_2) && detail::kmeans::adjust_centers(myPqCenters[devId], + (1 << bitPq), + lenPq, + rotVectors[devId], + numTrainset, + rotVectorLabels[devId], + pqClusterSize[devId], + (float)1.0 / 4, + device_memory, + handle.get_stream())) { iter -= 1; } } @@ -3690,6 +3554,13 @@ void cuannIvfPqBuildIndex( "Unsupported dtype (inner-product metric support float only)"); } + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); + if (pool_guard) { + RAFT_LOG_DEBUG("cuannIvfPqBuildIndex: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + desc->dtypeDataset = dtype; char dtypeString[64]; _cuann_get_dtype_string(desc->dtypeDataset, dtypeString); @@ -3779,18 +3650,21 @@ void cuannIvfPqBuildIndex( mesoClusterCentersTemp, mesoClusterSize, true); - if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(mesoClusterCenters, - numMesoClusters, - desc->dimDataset, - trainset, - dtype, - numTrainset, - mesoClusterLabels, - desc->similarity, - mesoClusterSize, - (float)1.0 / 4, - nullptr)) { + if ((iter + 1 < numIterations_2) && detail::kmeans::adjust_centers(mesoClusterCenters, + numMesoClusters, + desc->dimDataset, + trainset, + numTrainset, + mesoClusterLabels, + mesoClusterSize, + (float)1.0 / 4, + device_memory, + handle.get_stream())) { iter -= 1; + if (desc->similarity == CUANN_SIMILARITY_INNER) { + detail::utils::normalize_rows( + numMesoClusters, desc->dimDataset, mesoClusterCenters, handle.get_stream()); + } } } RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -3873,34 +3747,15 @@ void cuannIvfPqBuildIndex( } RAFT_EXPECTS(k == mesoClusterSize[i], "unexpected cluster size for cluster %u", i); - if (dtype == CUDA_R_32F) { - detail::utils::copy_selected(mesoClusterSize[i], - desc->dimDataset, - (const float*)trainset, - (const uint32_t*)(idsTrainset[devId]), - desc->dimDataset, - subTrainset[devId], - desc->dimDataset, - handle.get_stream()); - } else if (dtype == CUDA_R_8U) { - detail::utils::copy_selected(mesoClusterSize[i], - desc->dimDataset, - (const uint8_t*)trainset, - (const uint32_t*)(idsTrainset[devId]), - desc->dimDataset, - subTrainset[devId], - 
desc->dimDataset, - handle.get_stream()); - } else if (dtype == CUDA_R_8I) { - detail::utils::copy_selected(mesoClusterSize[i], - desc->dimDataset, - (const int8_t*)trainset, - (const uint32_t*)(idsTrainset[devId]), - desc->dimDataset, - subTrainset[devId], - desc->dimDataset, - handle.get_stream()); - } + detail::utils::copy_selected(mesoClusterSize[i], + desc->dimDataset, + trainset, + idsTrainset[devId], + desc->dimDataset, + subTrainset[devId], + desc->dimDataset, + handle.get_stream()); + int numIterations_2 = numIterations * 2; for (int iter = 0; iter < numIterations_2; iter += 2) { _cuann_kmeans_predict(handle, @@ -3917,18 +3772,22 @@ void cuannIvfPqBuildIndex( clusterCentersMP[devId], clusterSizeMP[devId], true); - if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(clusterCentersEach[devId], - numFineClusters[i], - desc->dimDataset, - subTrainset[devId], - CUDA_R_32F, - mesoClusterSize[i], - labelsMP[devId], - desc->similarity, - clusterSizeMP[devId], - (float)1.0 / 4, - wsKAC[devId])) { + if ((iter + 1 < numIterations_2) && + detail::kmeans::adjust_centers(clusterCentersEach[devId], + numFineClusters[i], + desc->dimDataset, + subTrainset[devId], + mesoClusterSize[i], + labelsMP[devId], + clusterSizeMP[devId], + (float)1.0 / 4, + device_memory, + handle.get_stream())) { iter -= 1; + if (desc->similarity == CUANN_SIMILARITY_INNER) { + detail::utils::normalize_rows( + numFineClusters[i], desc->dimDataset, clusterCentersEach[devId], handle.get_stream()); + } } } RAFT_CUDA_TRY(cudaMemcpy(clusterCenters + (desc->dimDataset * csumFineClusters[i]), @@ -3982,18 +3841,21 @@ void cuannIvfPqBuildIndex( true, clusterSize, true /* to update clusterCenters */); - if ((iter + 1 < numIterations_X) && _cuann_kmeans_adjust_centers(clusterCenters, - desc->numClusters, - desc->dimDataset, - trainset, - dtype, - numTrainset, - trainsetLabels, - desc->similarity, - clusterSize, - (float)1.0 / 5, - nullptr)) { + if ((iter + 1 < numIterations_X) && detail::kmeans::adjust_centers(clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + numTrainset, + trainsetLabels, + clusterSize, + (float)1.0 / 5, + device_memory, + handle.get_stream())) { iter -= (X - 1); + if (desc->similarity == CUANN_SIMILARITY_INNER) { + detail::utils::normalize_rows( + desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); + } } } } else { @@ -4014,18 +3876,21 @@ void cuannIvfPqBuildIndex( clusterCentersTemp, clusterSize, true); - if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(clusterCenters, - desc->numClusters, - desc->dimDataset, - trainset, - dtype, - numTrainset, - trainsetLabels, - desc->similarity, - clusterSize, - (float)1.0 / 4, - nullptr)) { + if ((iter + 1 < numIterations_2) && detail::kmeans::adjust_centers(clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + numTrainset, + trainsetLabels, + clusterSize, + (float)1.0 / 4, + device_memory, + handle.get_stream())) { iter -= 1; + if (desc->similarity == CUANN_SIMILARITY_INNER) { + detail::utils::normalize_rows( + desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); + } } } } @@ -4235,17 +4100,16 @@ void cuannIvfPqBuildIndex( pqCentersTemp[devId], pqClusterSize[devId], true); - if ((iter + 1 < numIterations_2) && _cuann_kmeans_adjust_centers(pqCentersEach[devId], - (1 << desc->bitPq), - desc->lenPq, - subTrainset[devId], - CUDA_R_32F, - numTrainset, - subTrainsetLabels[devId], - CUANN_SIMILARITY_L2, - pqClusterSize[devId], - (float)1.0 / 4, - wsKAC[devId])) { + if ((iter 
+ 1 < numIterations_2) && detail::kmeans::adjust_centers(pqCentersEach[devId], + (1 << desc->bitPq), + desc->lenPq, + subTrainset[devId], + numTrainset, + subTrainsetLabels[devId], + pqClusterSize[devId], + (float)1.0 / 4, + device_memory, + handle.get_stream())) { iter -= 1; } } @@ -4273,26 +4137,25 @@ void cuannIvfPqBuildIndex( // // Compute PQ code for whole dataset // - _cuann_compute_PQ_code(handle, - desc->numDataset, - desc->dimDataset, - desc->dimRotDataset, - desc->dimPq, - desc->lenPq, - desc->bitPq, - desc->numClusters, - dtype, - desc->typePqCenter, - maxClusterSize, - clusterCenters, - rotationMatrix, - dataset, - originalNumbers, - clusterSize, - indexPtr, - pqCenters, - numIterations, - pqDataset); + _cuann_compute_PQ_code(handle, + desc->numDataset, + desc->dimDataset, + desc->dimRotDataset, + desc->dimPq, + desc->lenPq, + desc->bitPq, + desc->numClusters, + desc->typePqCenter, + maxClusterSize, + clusterCenters, + rotationMatrix, + dataset, + originalNumbers, + clusterSize, + indexPtr, + pqCenters, + numIterations, + pqDataset); RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); // @@ -4471,11 +4334,12 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, } // cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex -inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( +template +void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( const handle_t& handle, const char* oldIndexFileName, const char* newIndexFileName, - const void* newVectors, /* [numNewVectors, dimDataset] */ + const T* newVectors, /* [numNewVectors, dimDataset] */ uint32_t numNewVectors) { switch (detail::utils::check_pointer_residency(newVectors)) { @@ -4619,26 +4483,25 @@ inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( uint8_t* pqDataset; // [numNewVectors, dimPq * bitPq / 8] RAFT_CUDA_TRY(cudaMallocManaged( &pqDataset, sizeof(uint8_t) * numNewVectors * oldDesc->dimPq * oldDesc->bitPq / 8)); - _cuann_compute_PQ_code(handle, - numNewVectors, - oldDesc->dimDataset, - oldDesc->dimRotDataset, - oldDesc->dimPq, - oldDesc->lenPq, - oldDesc->bitPq, - oldDesc->numClusters, - dtype, - oldDesc->typePqCenter, - maxClusterSize, - clusterCenters, - oldRotationMatrix, - newVectors, - originalNumbers, - clusterSize, - indexPtr, - oldPqCenters, - 0, - pqDataset); + _cuann_compute_PQ_code(handle, + numNewVectors, + oldDesc->dimDataset, + oldDesc->dimRotDataset, + oldDesc->dimPq, + oldDesc->lenPq, + oldDesc->bitPq, + oldDesc->numClusters, + oldDesc->typePqCenter, + maxClusterSize, + clusterCenters, + oldRotationMatrix, + newVectors, + originalNumbers, + clusterSize, + indexPtr, + oldPqCenters, + 0, + pqDataset); RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); // diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu index fa702a5bd5..d848149562 100644 --- a/cpp/test/spatial/ann_ivf_pq.cu +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -207,6 +207,7 @@ class IvfPqTest : public ::testing::TestWithParam { : ivf_pq::CUANN_SIMILARITY_L2; // Specify whether PQ codebooks are created per subspace or per cluster. 
ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_SUBSPACE; + // ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_CLUSTER; ivf_pq::cuannIvfPqSetIndexParameters( cuann_desc.get(), n_clusters, /* Number of clusters */ @@ -223,20 +224,11 @@ class IvfPqTest : public ::testing::TestWithParam { rmm::device_buffer ivf_pq_index_buf_managed(ivf_pq_index_size, stream_, &managed_memory); // Build index - cudaDataType_t dtype; - if constexpr (std::is_same_v) { - dtype = CUDA_R_8U; - } else if constexpr (std::is_same_v) { - dtype = CUDA_R_8I; - } else if constexpr (std::is_same_v) { - dtype = CUDA_R_32F; - } ivf_pq::cuannIvfPqBuildIndex( handle_, cuann_desc.get(), - database.data(), // dataset - database.data(), // ?kmeans? trainset - dtype, + database.data(), // dataset + database.data(), // ?kmeans? trainset uint32_t(ps.num_db_vecs), // size of the trainset (I guess for kmeans) numIterations, randomRotation, @@ -293,7 +285,6 @@ class IvfPqTest : public ::testing::TestWithParam { cuann_desc.get(), ivf_pq_index_buf_managed.data(), search_queries.data(), - dtype, ps.num_queries, indices_ivf_pq_dev.data(), distances_ivf_pq_dev.data(), From 67bffa44f9356753f83b426ee03288fb29168e72 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 11 Aug 2022 15:25:48 +0200 Subject: [PATCH 015/140] Put index as a member of descriptor --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 304 ++++++++++-------------- cpp/test/spatial/ann_ivf_pq.cu | 37 +-- 2 files changed, 128 insertions(+), 213 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 4621a88663..98189485c5 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -158,8 +158,42 @@ struct cuannIvfPqDescriptor { size_t sizeCubWorkspace; uint32_t _numClustersSize0; // (*) urgent WA, need to be fixed uint32_t preferredThreadBlockSize; + void* index_ptr; }; -typedef struct cuannIvfPqDescriptor* cuannIvfPqDescriptor_t; +using cuannIvfPqDescriptor_t = + std::unique_ptr>; + +cuannIvfPqDescriptor_t cuannIvfPqCreateDescriptor() +{ + return cuannIvfPqDescriptor_t{[]() { + auto desc = new cuannIvfPqDescriptor{}; + desc->numClusters = 0; + desc->numDataset = 0; + desc->dimDataset = 0; + desc->dimDatasetExt = 0; + desc->dimRotDataset = 0; + desc->dimPq = 0; + desc->bitPq = 0; + desc->numProbes = 0; + desc->topK = 0; + desc->maxQueries = 0; + desc->maxBatchSize = 0; + desc->maxSamples = 0; + desc->inclusiveSumSortedClusterSize = NULL; + desc->sqsumClusters = NULL; + desc->index_ptr = NULL; + return desc; + }(), + [](cuannIvfPqDescriptor* desc) { + if (desc->sqsumClusters != NULL) { + RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->sqsumClusters)); + } + if (desc->index_ptr != NULL) { + RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->index_ptr)); + } + delete desc; + }}; +} // header of index struct cuannIvfPqIndexHeader { @@ -2262,12 +2296,12 @@ inline void _cuann_find_topk(const handle_t& handle, */ // -inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t desc); +inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& desc); // search template inline void ivfpq_search(const handle_t& handle, - cuannIvfPqDescriptor_t desc, + cuannIvfPqDescriptor_t& desc, uint32_t numQueries, const float* clusterCenters, // [numDataset, dimDataset] const float* pqCenters, // [dimPq, 256, lenPq] @@ -2289,8 +2323,8 @@ inline void ivfpq_encode(uint32_t numDataset, ); // -bool manage_local_topk(cuannIvfPqDescriptor_t desc); 
-inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads); +bool manage_local_topk(cuannIvfPqDescriptor_t& desc); +inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads); // __global__ void ivfpq_init_topkScores(float* topkScores, // [num,] @@ -2492,7 +2526,7 @@ __global__ void ivfpq_make_outputs(uint32_t numProbes, } // -inline bool manage_local_topk(cuannIvfPqDescriptor_t desc) +inline bool manage_local_topk(cuannIvfPqDescriptor_t& desc) { int depth = (desc->topK + 31) / 32; if (depth > 4) { return false; } @@ -2502,7 +2536,7 @@ inline bool manage_local_topk(cuannIvfPqDescriptor_t desc) } // -inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThreads) +inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads) { if (manage_local_topk(desc)) { int topk_32 = (desc->topK + 31) / 32; @@ -2512,7 +2546,7 @@ inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t desc, int numThrea } // return workspace size -inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t desc) +inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& desc) { size_t size = 0; // clusterLabelsOut [maxBatchSize, numProbes] @@ -2744,11 +2778,8 @@ template __global__ void ivfpq_make_outputs( * */ -inline void cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc); -inline void cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc); - inline void cuannIvfPqSetIndexParameters( - cuannIvfPqDescriptor_t desc, + cuannIvfPqDescriptor_t& desc, const uint32_t numClusters, /* Number of clusters */ const uint32_t numDataset, /* Number of dataset entries */ const uint32_t dimDataset, /* Dimension of each entry */ @@ -2757,7 +2788,7 @@ inline void cuannIvfPqSetIndexParameters( const cuannSimilarity_t similarity, const cuannPqCenter_t typePqCenter); -inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, uint32_t* numClusters, uint32_t* numDataset, uint32_t* dimDataset, @@ -2766,84 +2797,16 @@ inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, cuannSimilarity_t* similarity, cuannPqCenter_t* typePqCenter); -inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, size_t* size /* bytes of dataset index */); -inline void cuannIvfPqSaveIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - const char* fileName); - -inline void cuannIvfPqLoadIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - void** index, - const char* fileName); - -inline void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( - const handle_t& handle, - const char* oldIndexFileName, - const char* newIndexFileName, - const void* newVectors, /* [numVectorsToAdd, dimDataset] */ - uint32_t numNewVectors); - -inline void cuannIvfPqSetSearchParameters( - cuannIvfPqDescriptor_t desc, - const uint32_t numProbes, /* Number of clusters to probe */ - const uint32_t topK); /* Number of search results */ - -inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - cudaDataType_t internalDistanceDtype, - cudaDataType_t smemLutDtype, - const uint32_t preferredThreadBlockSize); - -inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, - uint32_t* numProbes, - uint32_t* topK); - -inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, - 
cudaDataType_t* internalDistanceDtype, - cudaDataType_t* smemLutDtype, - uint32_t* preferredThreadBlockSize); - -inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, - uint32_t numQueries, - size_t maxWorkspaceSize, - size_t* workspaceSize); - -inline void cuannPostprocessingRefine(uint32_t numDataset, - uint32_t numQueries, - uint32_t dimDataset, - const void* dataset, /* [numDataset, dimDataset] */ - const void* queries, /* [numQueries, dimDataset] */ - cudaDataType_t dtype, - cuannSimilarity_t similarity, - uint32_t topK, - const uint64_t* neighbors, /* [numQueries, topK] */ - uint32_t refinedTopK, - uint64_t* refinedNeighbors, /* [numQueries, refinedTopK] */ - float* refinedDistances /* [numQueries, refinedTopK] */ -); - -inline void cuannPostprocessingMerge( - uint32_t numSplit, - uint32_t numQueries, - uint32_t topK, - const uint32_t* eachNumDataset, /* [numSplit] */ - const uint64_t* eachNeighbors, /* [numSplit, numQueries, topK] */ - const float* eachDistances, /* [numSplit, numQueries, topK] */ - uint64_t* neighbors, /* [numQueries, topK] */ - float* distances /* [numQueries, topK] */ -); - -inline size_t _cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t& desc) { // [numClusters, dimDatasetExt] return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimDatasetExt); } -inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t& desc) { size_t size_base = sizeof(float) * (1 << desc->bitPq) * desc->lenPq; if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { @@ -2855,38 +2818,37 @@ inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t desc) } } -inline size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t& desc) { // [numDataset, dimPq * bitPq / 8] return Pow2<128>::roundUp(sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8); } -inline size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t& desc) { // [numDataset,] return Pow2<128>::roundUp(sizeof(uint32_t) * desc->numDataset); } -inline size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t& desc) { // [numClusters + 1,] return Pow2<128>::roundUp(sizeof(uint32_t) * (desc->numClusters + 1)); } -inline size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t& desc) { // [dimDataset, dimRotDataset] return Pow2<128>::roundUp(sizeof(float) * desc->dimDataset * desc->dimRotDataset); } -inline size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t desc) +inline size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t& desc) { // [numClusters, dimRotDataset] return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimRotDataset); } -inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t desc, - const void* index, +inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t& desc, struct cuannIvfPqIndexHeader** header, float** clusterCenters, // [numClusters, dimDatasetExt] float** pqCenters, // [dimPq, 1 << bitPq, lenPq], or @@ -2898,7 +2860,7 @@ inline void 
_cuann_get_index_pointers(cuannIvfPqDescriptor_t desc, float** clusterRotCenters // [numClusters, dimRotDataset] ) { - *header = (struct cuannIvfPqIndexHeader*)index; + *header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); *clusterCenters = (float*)((uint8_t*)(*header) + sizeof(struct cuannIvfPqIndexHeader)); *pqCenters = (float*)((uint8_t*)(*clusterCenters) + _cuann_getIndexSize_clusterCenters(desc)); *pqDataset = (uint8_t*)((uint8_t*)(*pqCenters) + _cuann_getIndexSize_pqCenters(desc)); @@ -2944,7 +2906,7 @@ inline void _cuann_get_random_norm_vector(int len, float* vector) } inline void _cuann_get_inclusiveSumSortedClusterSize( - cuannIvfPqDescriptor_t desc, + cuannIvfPqDescriptor_t& desc, const uint32_t* indexPtr, // [numClusters + 1] float* clusterCenters, // [numClusters, dimDatasetExt] uint32_t** output // [numClusters] @@ -2971,7 +2933,7 @@ inline void _cuann_get_inclusiveSumSortedClusterSize( RAFT_EXPECTS((*output)[desc->numClusters - 1] == desc->numDataset, "cluster sizes do not add up"); } -inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t desc, +inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t& desc, const float* clusterCenters, // [numClusters, dimDataset,] float** output // [numClusters,] ) @@ -3409,37 +3371,8 @@ void _cuann_compute_PQ_code(const handle_t& handle, } } -// cuannIvfPqCreateDescriptor -inline void cuannIvfPqCreateDescriptor(cuannIvfPqDescriptor_t* desc) -{ - *desc = (cuannIvfPqDescriptor_t)malloc(sizeof(struct cuannIvfPqDescriptor)); - RAFT_EXPECTS(*desc != nullptr, "cuann allocation failed"); - (*desc)->numClusters = 0; - (*desc)->numDataset = 0; - (*desc)->dimDataset = 0; - (*desc)->dimDatasetExt = 0; - (*desc)->dimRotDataset = 0; - (*desc)->dimPq = 0; - (*desc)->bitPq = 0; - (*desc)->numProbes = 0; - (*desc)->topK = 0; - (*desc)->maxQueries = 0; - (*desc)->maxBatchSize = 0; - (*desc)->maxSamples = 0; - (*desc)->inclusiveSumSortedClusterSize = NULL; - (*desc)->sqsumClusters = NULL; -} - -// cuannIvfPqDestroyDescriptor -inline void cuannIvfPqDestroyDescriptor(cuannIvfPqDescriptor_t desc) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - if (desc->sqsumClusters != NULL) { RAFT_CUDA_TRY(cudaFree(desc->sqsumClusters)); } - free(desc); -} - // cuannIvfPqSetIndexParameters -inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t& desc, const uint32_t numClusters, const uint32_t numDataset, const uint32_t dimDataset, @@ -3486,7 +3419,7 @@ inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetIndexParameters -inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, uint32_t* numClusters, uint32_t* numDataset, uint32_t* dimDataset, @@ -3507,7 +3440,7 @@ inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetIndexSize -inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) +inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, size_t* size) { RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); @@ -3525,14 +3458,13 @@ inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t desc, size_t* size) template void cuannIvfPqBuildIndex( const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const T* dataset, /* [numDataset, dimDataset] */ - const T* trainset, /* [numTrainset, dimDataset] */ - uint32_t numTrainset, /* Number of train-set entries */ - 
uint32_t numIterations, /* Number of iterations to train kmeans */ - bool randomRotation, /* If true, rotate vectors with randamly created rotation matrix */ - bool hierarchicalClustering, /* If true, do kmeans training hierarchically */ - void* index /* database index to build */) + cuannIvfPqDescriptor_t& desc, + const T* dataset, /* [numDataset, dimDataset] */ + const T* trainset, /* [numTrainset, dimDataset] */ + uint32_t numTrainset, /* Number of train-set entries */ + uint32_t numIterations, /* Number of iterations to train kmeans */ + bool randomRotation, /* If true, rotate vectors with randamly created rotation matrix */ + bool hierarchicalClustering /* If true, do kmeans training hierarchically */) { int cuannDevId = handle.get_device(); int callerDevId = _cuann_set_device(cuannDevId); @@ -3572,6 +3504,11 @@ void cuannIvfPqBuildIndex( default: RAFT_FAIL("both dataset and trainsed must be accessible from the host."); } + if (desc->index_ptr != NULL) { RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->index_ptr)); } + size_t index_size; + ivf_pq::cuannIvfPqGetIndexSize(desc, &index_size); + RAFT_CUDA_TRY(cudaMallocManaged(&(desc->index_ptr), index_size)); + struct cuannIvfPqIndexHeader* header; float* clusterCenters; // [numClusters, dimDataset] float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or @@ -3582,7 +3519,6 @@ void cuannIvfPqBuildIndex( float* rotationMatrix; // [dimDataset, dimRotDataset] float* clusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(desc, - index, &header, &clusterCenters, &pqCenters, @@ -4213,8 +4149,7 @@ void cuannIvfPqBuildIndex( // cuannIvfPqSaveIndex inline void cuannIvfPqSaveIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, + cuannIvfPqDescriptor_t& desc, const char* fileName) { RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); @@ -4223,9 +4158,9 @@ inline void cuannIvfPqSaveIndex(const handle_t& handle, FILE* fp = fopen(fileName, "w"); RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); - struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)index; + struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); RAFT_LOG_DEBUG("indexSize: %lu\n", header->indexSize); - if (fwrite(index, 1, header->indexSize, fp) != header->indexSize) { + if (fwrite(desc->index_ptr, 1, header->indexSize, fp) != header->indexSize) { RAFT_FAIL("(%s) failed to save index to file (%s)\n", __func__, fileName); } fclose(fp); @@ -4235,8 +4170,7 @@ inline void cuannIvfPqSaveIndex(const handle_t& handle, // cuannIvfPqLoadIndex inline void cuannIvfPqLoadIndex(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - void** index, + cuannIvfPqDescriptor_t& desc, const char* fileName) { RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); @@ -4246,20 +4180,22 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, FILE* fp = fopen(fileName, "r"); RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); + if (desc->index_ptr != NULL) { RAFT_CUDA_TRY(cudaFree(desc->index_ptr)); } size_t indexSize; fread(&indexSize, sizeof(size_t), 1, fp); RAFT_LOG_DEBUG("indexSize: %lu\n", indexSize); - RAFT_CUDA_TRY(cudaMallocManaged(index, indexSize)); + RAFT_CUDA_TRY(cudaMallocManaged(&(desc->index_ptr), indexSize)); fseek(fp, 0, SEEK_SET); - if (fread(*index, 1, indexSize, fp) != indexSize) { + if (fread(desc->index_ptr, 1, indexSize, fp) != indexSize) { RAFT_FAIL("(%s) failed to load index to from file (%s)\n", 
__func__, fileName); }
fclose(fp);
- RAFT_CUDA_TRY(cudaMemAdvise(index, indexSize, cudaMemAdviseSetReadMostly, handle.get_device()));
+ RAFT_CUDA_TRY(
+ cudaMemAdvise(desc->index_ptr, indexSize, cudaMemAdviseSetReadMostly, handle.get_device()));
}
- struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(*index);
+ struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr);
desc->numClusters = header->numClusters;
desc->numDataset = header->numDataset;
desc->dimDataset = header->dimDataset;
@@ -4283,7 +4219,6 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle,
float* rotationMatrix; // [dimDataset, dimRotDataset]
float* clusterRotCenters; // [numClusters, dimRotDataset]
_cuann_get_index_pointers(desc,
- *index,
&header,
&clusterCenters,
&pqCenters,
@@ -4333,14 +4268,12 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle,
_cuann_set_device(orgDevId);
}
-// cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex
template <typename T>
-void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex(
+auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex(
const handle_t& handle,
- const char* oldIndexFileName,
- const char* newIndexFileName,
+ cuannIvfPqDescriptor_t& oldDesc,
const T* newVectors, /* [numNewVectors, dimDataset] */
- uint32_t numNewVectors)
+ uint32_t numNewVectors) -> cuannIvfPqDescriptor_t
{
switch (detail::utils::check_pointer_residency(newVectors)) {
case detail::utils::pointer_residency::host_only:
@@ -4350,14 +4283,28 @@ void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex(
int cuannDevId = handle.get_device();
int callerDevId = _cuann_set_device(cuannDevId);
- //
- // Load old index
- //
- cuannIvfPqDescriptor_t oldDesc;
- cuannIvfPqCreateDescriptor(&oldDesc);
- void* oldIndex;
- cuannIvfPqLoadIndex(handle, oldDesc, &oldIndex, oldIndexFileName);
cudaDataType_t dtype = oldDesc->dtypeDataset;
+ if constexpr (std::is_same_v<T, float>) {
+ RAFT_EXPECTS(
+ dtype == CUDA_R_32F,
+ "The old index type (%d) doesn't match CUDA_R_32F required by the template instantiation",
+ dtype);
+ } else if constexpr (std::is_same_v<T, uint8_t>) {
+ RAFT_EXPECTS(
+ dtype == CUDA_R_8U,
+ "The old index type (%d) doesn't match CUDA_R_8U required by the template instantiation",
+ dtype);
+ } else if constexpr (std::is_same_v<T, int8_t>) {
+ RAFT_EXPECTS(
+ dtype == CUDA_R_8I,
+ "The old index type (%d) doesn't match CUDA_R_8I required by the template instantiation",
+ dtype);
+ } else {
+ static_assert(
+ std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+ "unsupported type");
+ }
+
char dtypeString[64];
_cuann_get_dtype_string(dtype, dtypeString);
RAFT_LOG_DEBUG("dtype: %s", dtypeString);
@@ -4372,7 +4319,6 @@ void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex(
float* oldRotationMatrix; // [dimDataset, dimRotDataset]
float* oldClusterRotCenters; // [numClusters, dimRotDataset]
_cuann_get_index_pointers(oldDesc,
- oldIndex,
&oldHeader,
&oldClusterCenters,
&oldPqCenters,
@@ -4507,10 +4453,10 @@ void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex(
//
// Create descriptor for new index
//
- cuannIvfPqDescriptor_t newDesc;
- cuannIvfPqCreateDescriptor(&newDesc);
- memcpy(newDesc, oldDesc, sizeof(struct cuannIvfPqDescriptor));
+ auto newDesc = cuannIvfPqCreateDescriptor();
+ memcpy(newDesc.get(), oldDesc.get(), sizeof(struct cuannIvfPqDescriptor));
newDesc->numDataset += numNewVectors;
+ newDesc->index_ptr = nullptr;
RAFT_LOG_DEBUG("numDataset: %u -> %u", oldDesc->numDataset, newDesc->numDataset);
//
//
@@ -4519,8 +4465,8 @@ void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex(
size_t newIndexSize;
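// Note: the whole index lives in a single contiguous (managed-memory) allocation;
// cuannIvfPqGetIndexSize derives the total byte size from the descriptor fields set above, and
// _cuann_get_index_pointers later carves that buffer into header, cluster centers, PQ codebooks,
// PQ-encoded dataset, original numbers, cluster offsets, rotation matrix and rotated centers.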
cuannIvfPqGetIndexSize(newDesc, &newIndexSize); RAFT_LOG_DEBUG("indexSize: %lu -> %lu", oldHeader->indexSize, newIndexSize); - void* newIndex = malloc(newIndexSize); - memset(newIndex, 0, newIndexSize); + RAFT_CUDA_TRY(cudaMallocManaged(&(newDesc->index_ptr), newIndexSize)); + memset(newDesc->index_ptr, 0, newIndexSize); struct cuannIvfPqIndexHeader* newHeader; float* newClusterCenters; // [numClusters, dimDatasetExt] float* newPqCenters; // [dimPq, 1 << bitPq, lenPq], or @@ -4531,7 +4477,6 @@ void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( float* newRotationMatrix; // [dimDataset, dimRotDataset] float* newClusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(newDesc, - newIndex, &newHeader, &newClusterCenters, &newPqCenters, @@ -4607,7 +4552,6 @@ void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // // Save new index // - cuannIvfPqSaveIndex(handle, newDesc, newIndex, newIndexFileName); if (newHeader->numDatasetAdded * 2 >= newHeader->numDataset) { RAFT_LOG_INFO( "The total number of vectors in the new index" @@ -4618,27 +4562,20 @@ void cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( newHeader->numDatasetAdded); } - // - // Finalize - // - cuannIvfPqDestroyDescriptor(oldDesc); - cuannIvfPqDestroyDescriptor(newDesc); - free(originalNumbers); free(indexPtr); - free(newIndex); RAFT_CUDA_TRY(cudaFree(pqDataset)); RAFT_CUDA_TRY(cudaFree(clusterSize)); RAFT_CUDA_TRY(cudaFree(newVectorLabels)); RAFT_CUDA_TRY(cudaFree(clusterCenters)); - RAFT_CUDA_TRY(cudaFree(oldIndex)); _cuann_set_device(callerDevId); + return newDesc; } // cuannIvfPqSetSearchParameters -inline void cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t& desc, const uint32_t numProbes, const uint32_t topK) { @@ -4678,7 +4615,7 @@ inline void cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqSetSearchParameters -inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t& desc, cudaDataType_t internalDistanceDtype, cudaDataType_t smemLutDtype, const uint32_t preferredThreadBlockSize) @@ -4699,7 +4636,7 @@ inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetSearchParameters -inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t& desc, uint32_t* numProbes, uint32_t* topK) { @@ -4709,7 +4646,7 @@ inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t desc, } // cuannIvfPqGetSearchTuningParameters -inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, +inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t& desc, cudaDataType_t* internalDistanceDtype, cudaDataType_t* smemLutDtype, uint32_t* preferredThreadBlockSize) @@ -4722,8 +4659,7 @@ inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t desc, // cuannIvfPqSearch inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, + cuannIvfPqDescriptor_t& desc, uint32_t maxQueries, size_t maxWorkspaceSize, size_t* workspaceSize) @@ -4801,8 +4737,7 @@ inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, template void cuannIvfPqSearch(const handle_t& handle, - cuannIvfPqDescriptor_t desc, - const void* index, + cuannIvfPqDescriptor_t& desc, const T* queries, /* [numQueries, dimDataset], host or device pointer */ 
uint32_t numQueries, uint64_t* neighbors, /* [numQueries, topK], device pointer */ @@ -4835,7 +4770,6 @@ void cuannIvfPqSearch(const handle_t& handle, float* rotationMatrix; // [dimDataset, dimRotDataset] float* clusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(desc, - index, &header, &clusterCenters, &pqCenters, @@ -4871,7 +4805,7 @@ void cuannIvfPqSearch(const handle_t& handle, Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); void (*_ivfpq_search)(const handle_t&, - cuannIvfPqDescriptor_t, + cuannIvfPqDescriptor_t&, uint32_t, const float*, const float*, @@ -5876,7 +5810,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( // search template inline void ivfpq_search(const handle_t& handle, - cuannIvfPqDescriptor_t desc, + cuannIvfPqDescriptor_t& desc, uint32_t numQueries, const float* clusterCenters, // [numDataset, dimRotDataset] const float* pqCenters, // [dimPq, 1 << desc->bitPq, lenPq] diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu index d848149562..a4b637283b 100644 --- a/cpp/test/spatial/ann_ivf_pq.cu +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -157,15 +157,7 @@ class IvfPqTest : public ::testing::TestWithParam { rmm::device_uvector indices_ivf_pq_dev(queries_size, stream_); { - std::unique_ptr> - cuann_desc{ - []() { - ivf_pq::cuannIvfPqDescriptor_t d; - ivf_pq::cuannIvfPqCreateDescriptor(&d); - return d; - }(), - [](ivf_pq::cuannIvfPqDescriptor_t d) { ivf_pq::cuannIvfPqDestroyDescriptor(d); }}; + auto cuann_desc = ivf_pq::cuannIvfPqCreateDescriptor(); // Number of kmeans clusters. // @@ -209,7 +201,7 @@ class IvfPqTest : public ::testing::TestWithParam { ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_SUBSPACE; // ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_CLUSTER; ivf_pq::cuannIvfPqSetIndexParameters( - cuann_desc.get(), + cuann_desc, n_clusters, /* Number of clusters */ uint32_t(ps.num_db_vecs), /* Number of dataset entries */ uint32_t(ps.dim), /* Dimension of each entry */ @@ -218,27 +210,21 @@ class IvfPqTest : public ::testing::TestWithParam { similarity, typePqCenter); - // Allocate memory for index - size_t ivf_pq_index_size; - ivf_pq::cuannIvfPqGetIndexSize(cuann_desc.get(), &ivf_pq_index_size); - rmm::device_buffer ivf_pq_index_buf_managed(ivf_pq_index_size, stream_, &managed_memory); - // Build index ivf_pq::cuannIvfPqBuildIndex( handle_, - cuann_desc.get(), + cuann_desc, database.data(), // dataset database.data(), // ?kmeans? trainset uint32_t(ps.num_db_vecs), // size of the trainset (I guess for kmeans) numIterations, randomRotation, - true, // hierarchialClustering: always true in raft - ivf_pq_index_buf_managed.data() // memory allocated for the index + true // hierarchialClustering: always true in raft ); handle_.sync_stream(stream_); // set search parameters - ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc.get(), ps.nprobe, ps.k); + ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc, ps.nprobe, ps.k); // Data type of LUT to be created dynamically at search time. 
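// (The LUT caches the distances between each sub-vector of the rotated query and all
//  (1 << bitPq) codebook entries, so scoring one PQ-encoded vector reduces to dimPq table
//  lookups and additions.)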
// // The use of low-precision types reduces the amount of shared memory @@ -263,7 +249,7 @@ class IvfPqTest : public ::testing::TestWithParam { // uint32_t preferredThreadBlockSize = 0; // 0, 256, 512, or 1024 ivf_pq::cuannIvfPqSetSearchTuningParameters( - cuann_desc.get(), internalDistanceDtype, smemLutDtype, preferredThreadBlockSize); + cuann_desc, internalDistanceDtype, smemLutDtype, preferredThreadBlockSize); // Maximum number of query vectors to search at the same time. uint32_t batchSize = std::min(ps.num_queries, 32768); // Maximum device memory size that may be used as workspace at search time. @@ -272,18 +258,13 @@ class IvfPqTest : public ::testing::TestWithParam { // Allocate memory for index size_t ivf_pq_search_workspace_size; - ivf_pq::cuannIvfPqSearch_bufferSize(handle_, - cuann_desc.get(), - ivf_pq_index_buf_managed.data(), - batchSize, - maxSearchWorkspaceSize, - &ivf_pq_search_workspace_size); + ivf_pq::cuannIvfPqSearch_bufferSize( + handle_, cuann_desc, batchSize, maxSearchWorkspaceSize, &ivf_pq_search_workspace_size); rmm::device_buffer ivf_pq_search_ws_buf(ivf_pq_search_workspace_size, stream_); // finally, search! cuannIvfPqSearch(handle_, - cuann_desc.get(), - ivf_pq_index_buf_managed.data(), + cuann_desc, search_queries.data(), ps.num_queries, indices_ivf_pq_dev.data(), From f951c5236331fc73f648d46b076773f88b9065fe Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 12 Aug 2022 11:55:39 +0200 Subject: [PATCH 016/140] Add the index extending operation --- .../raft/spatial/knn/detail/ann_utils.cuh | 2 +- cpp/include/raft/spatial/knn/ivf_pq.cuh | 212 +++++++++--------- cpp/test/spatial/ann_ivf_pq.cu | 34 +-- 3 files changed, 134 insertions(+), 114 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 24075ace55..11013bd4d9 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -542,7 +542,7 @@ void copy_selected(uint32_t n_rows, uint32_t ld_dst, rmm::cuda_stream_view stream) { - switch (check_pointer_residency(src, dst)) { + switch (check_pointer_residency(src, dst, row_ids)) { case pointer_residency::host_and_device: case pointer_residency::device_only: { uint32_t block_dim = 128; diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 98189485c5..e59f14fc31 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -179,16 +179,19 @@ cuannIvfPqDescriptor_t cuannIvfPqCreateDescriptor() desc->maxQueries = 0; desc->maxBatchSize = 0; desc->maxSamples = 0; - desc->inclusiveSumSortedClusterSize = NULL; - desc->sqsumClusters = NULL; - desc->index_ptr = NULL; + desc->inclusiveSumSortedClusterSize = nullptr; + desc->sqsumClusters = nullptr; + desc->index_ptr = nullptr; return desc; }(), [](cuannIvfPqDescriptor* desc) { - if (desc->sqsumClusters != NULL) { + if (desc->inclusiveSumSortedClusterSize != nullptr) { + free(desc->inclusiveSumSortedClusterSize); + } + if (desc->sqsumClusters != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->sqsumClusters)); } - if (desc->index_ptr != NULL) { + if (desc->index_ptr != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->index_ptr)); } delete desc; @@ -2307,7 +2310,7 @@ inline void ivfpq_search(const handle_t& handle, const float* pqCenters, // [dimPq, 256, lenPq] const uint8_t* pqDataset, // [numDataset, dimPq] const uint32_t* originalNumbers, // [numDataset] - const uint32_t* indexPtr, // [numClusters + 1] 
+ const uint32_t* cluster_offsets, // [numClusters + 1] const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] const float* query, // [dimDataset] uint64_t* topKNeighbors, // [topK] @@ -2338,7 +2341,7 @@ __global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList); __global__ void ivfpq_make_chunk_index_ptr( uint32_t numProbes, uint32_t sizeBatch, - const uint32_t* indexPtr, // [numClusters + 1,] + const uint32_t* cluster_offsets, // [numClusters + 1,] const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] uint32_t* numSamples // [sizeBatch,] @@ -2397,7 +2400,7 @@ __device__ inline uint32_t thread_block_scan(uint32_t x, uint32_t* smem) __global__ void ivfpq_make_chunk_index_ptr( uint32_t numProbes, uint32_t sizeBatch, - const uint32_t* indexPtr, // [numClusters + 1,] + const uint32_t* cluster_offsets, // [numClusters + 1,] const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] uint32_t* numSamples // [sizeBatch,] @@ -2418,7 +2421,7 @@ __global__ void ivfpq_make_chunk_index_ptr( uint32_t val = 0; if (i < numProbes) { uint32_t l = clusterLabelsToProbe[i]; - val = indexPtr[l + 1] - indexPtr[l]; + val = cluster_offsets[l + 1] - cluster_offsets[l]; } val = thread_block_scan(val, smem_temp); @@ -2855,7 +2858,7 @@ inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t& desc, // [numClusters, 1 << bitPq, lenPq] uint8_t** pqDataset, // [numDataset, dimPq * bitPq / 8] uint32_t** originalNumbers, // [numDataset] - uint32_t** indexPtr, // [numClusters + 1] + uint32_t** cluster_offsets, // [numClusters + 1] float** rotationMatrix, // [dimDataset, dimRotDataset] float** clusterRotCenters // [numClusters, dimRotDataset] ) @@ -2865,20 +2868,21 @@ inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t& desc, *pqCenters = (float*)((uint8_t*)(*clusterCenters) + _cuann_getIndexSize_clusterCenters(desc)); *pqDataset = (uint8_t*)((uint8_t*)(*pqCenters) + _cuann_getIndexSize_pqCenters(desc)); *originalNumbers = (uint32_t*)((uint8_t*)(*pqDataset) + _cuann_getIndexSize_pqDataset(desc)); - *indexPtr = (uint32_t*)((uint8_t*)(*originalNumbers) + _cuann_getIndexSize_originalNumbers(desc)); - *rotationMatrix = (float*)((uint8_t*)(*indexPtr) + _cuann_getIndexSize_indexPtr(desc)); + *cluster_offsets = + (uint32_t*)((uint8_t*)(*originalNumbers) + _cuann_getIndexSize_originalNumbers(desc)); + *rotationMatrix = (float*)((uint8_t*)(*cluster_offsets) + _cuann_getIndexSize_indexPtr(desc)); *clusterRotCenters = (float*)((uint8_t*)(*rotationMatrix) + _cuann_getIndexSize_rotationMatrix(desc)); } __global__ void kern_get_cluster_size(uint32_t numClusters, - const uint32_t* indexPtr, // [numClusters + 1,] - uint32_t* clusterSize // [numClusters,] + const uint32_t* cluster_offsets, // [numClusters + 1,] + uint32_t* clusterSize // [numClusters,] ) { uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); if (i >= numClusters) return; - clusterSize[i] = indexPtr[i + 1] - indexPtr[i]; + clusterSize[i] = cluster_offsets[i + 1] - cluster_offsets[i]; } template @@ -2907,16 +2911,17 @@ inline void _cuann_get_random_norm_vector(int len, float* vector) inline void _cuann_get_inclusiveSumSortedClusterSize( cuannIvfPqDescriptor_t& desc, - const uint32_t* indexPtr, // [numClusters + 1] - float* clusterCenters, // [numClusters, dimDatasetExt] - uint32_t** output // [numClusters] + const uint32_t* cluster_offsets, // [numClusters + 1] + float* clusterCenters, // [numClusters, 
dimDatasetExt] + uint32_t** output // [numClusters] ) { // [CPU] + if (*output != nullptr) { free(*output); } *output = (uint32_t*)malloc(sizeof(uint32_t) * desc->numClusters); desc->_numClustersSize0 = 0; for (uint32_t i = 0; i < desc->numClusters; i++) { - (*output)[i] = indexPtr[i + 1] - indexPtr[i]; + (*output)[i] = cluster_offsets[i + 1] - cluster_offsets[i]; if ((*output)[i] > 0) continue; desc->_numClustersSize0 += 1; @@ -3144,7 +3149,7 @@ void _cuann_compute_PQ_code(const handle_t& handle, const T* dataset, // [numDataset] const uint32_t* originalNumbers, // [numDataset] const uint32_t* clusterSize, // [numClusters] - const uint32_t* indexPtr, // [numClusters + 1] + const uint32_t* cluster_offsets, // [numClusters + 1] float* pqCenters, // [...] uint32_t numIterations, uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] @@ -3214,7 +3219,7 @@ void _cuann_compute_PQ_code(const handle_t& handle, detail::utils::copy_selected(clusterSize[l], dimDataset, dataset, - originalNumbers + indexPtr[l], + originalNumbers + cluster_offsets[l], dimDataset, resVectors[devId], dimDataset, @@ -3347,7 +3352,7 @@ void _cuann_compute_PQ_code(const handle_t& handle, // ivfpq_encode( clusterSize[l], clusterSize[l], dimPq, bitPq, subVectorLabels[devId], myPqDataset[devId]); - RAFT_CUDA_TRY(cudaMemcpy(pqDataset + ((uint64_t)indexPtr[l] * dimPq * bitPq / 8), + RAFT_CUDA_TRY(cudaMemcpy(pqDataset + ((uint64_t)cluster_offsets[l] * dimPq * bitPq / 8), myPqDataset[devId], sizeof(uint8_t) * clusterSize[l] * dimPq * bitPq / 8, cudaMemcpyDeviceToHost)); @@ -3515,7 +3520,7 @@ void cuannIvfPqBuildIndex( // [numClusters, 1 << bitPq, lenPq] uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] uint32_t* originalNumbers; // [numDataset] - uint32_t* indexPtr; // [numClusters + 1] + uint32_t* cluster_offsets; // [numClusters + 1] float* rotationMatrix; // [dimDataset, dimRotDataset] float* clusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(desc, @@ -3524,7 +3529,7 @@ void cuannIvfPqBuildIndex( &pqCenters, &pqDataset, &originalNumbers, - &indexPtr, + &cluster_offsets, &rotationMatrix, &clusterRotCenters); @@ -3882,28 +3887,29 @@ void cuannIvfPqBuildIndex( handle.get_stream()); // - // Make indexPtr, originalNumbers and pqDataset + // Make cluster_offsets, originalNumbers and pqDataset // uint32_t maxClusterSize = 0; - // indexPtr - indexPtr[0] = 0; + // cluster_offsets + cluster_offsets[0] = 0; for (uint32_t l = 0; l < desc->numClusters; l++) { - indexPtr[l + 1] = indexPtr[l] + clusterSize[l]; + cluster_offsets[l + 1] = cluster_offsets[l] + clusterSize[l]; if (maxClusterSize < clusterSize[l]) { maxClusterSize = clusterSize[l]; } } - RAFT_EXPECTS(indexPtr[desc->numClusters] == desc->numDataset, "Cluster sizes do not add up"); + RAFT_EXPECTS(cluster_offsets[desc->numClusters] == desc->numDataset, + "Cluster sizes do not add up"); desc->maxClusterSize = maxClusterSize; // originalNumbers for (uint32_t i = 0; i < desc->numDataset; i++) { - uint32_t l = datasetLabels[i]; - originalNumbers[indexPtr[l]] = i; - indexPtr[l] += 1; + uint32_t l = datasetLabels[i]; + originalNumbers[cluster_offsets[l]] = i; + cluster_offsets[l] += 1; } - // Recover indexPtr + // Recover cluster_offsets for (uint32_t l = 0; l < desc->numClusters; l++) { - indexPtr[l] -= clusterSize[l]; + cluster_offsets[l] -= clusterSize[l]; } // [numDevices][1 << bitPq, lenPq] @@ -4088,7 +4094,7 @@ void cuannIvfPqBuildIndex( dataset, originalNumbers, clusterSize, - indexPtr, + cluster_offsets, pqCenters, numIterations, pqDataset); @@ 
-4096,7 +4102,7 @@ void cuannIvfPqBuildIndex( // _cuann_get_inclusiveSumSortedClusterSize( - desc, indexPtr, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); + desc, cluster_offsets, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); _cuann_get_sqsumClusters(desc, clusterCenters, &(desc->sqsumClusters)); { @@ -4215,7 +4221,7 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, // [numClusters, 1 << bitPq, lenPq] uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] uint32_t* originalNumbers; // [numDataset] - uint32_t* indexPtr; // [numClusters + 1] + uint32_t* cluster_offsets; // [numClusters + 1] float* rotationMatrix; // [dimDataset, dimRotDataset] float* clusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(desc, @@ -4224,13 +4230,13 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, &pqCenters, &pqDataset, &originalNumbers, - &indexPtr, + &cluster_offsets, &rotationMatrix, &clusterRotCenters); // _cuann_get_inclusiveSumSortedClusterSize( - desc, indexPtr, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); + desc, cluster_offsets, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); size_t size; // pqDataset @@ -4251,9 +4257,9 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, // originalNumbers size = sizeof(uint32_t) * desc->numDataset; RAFT_CUDA_TRY(cudaMemPrefetchAsync(originalNumbers, size, handle.get_device())); - // indexPtr + // cluster_offsets size = sizeof(uint32_t) * (desc->numClusters + 1); - RAFT_CUDA_TRY(cudaMemPrefetchAsync(indexPtr, size, handle.get_device())); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(cluster_offsets, size, handle.get_device())); // rotationMatrix if (rotationMatrix != NULL) { size = sizeof(float) * desc->dimDataset * desc->dimRotDataset; @@ -4310,21 +4316,21 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( RAFT_LOG_DEBUG("dtype: %s", dtypeString); RAFT_LOG_DEBUG("dimDataset: %u", oldDesc->dimDataset); struct cuannIvfPqIndexHeader* oldHeader; - float* oldClusterCenters; // [numClusters, dimDatasetExt] - float* oldPqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t* oldPqDataset; // [numDataset, dimPq * bitPq / 8] - uint32_t* oldOriginalNumbers; // [numDataset] - uint32_t* oldIndexPtr; // [numClusters + 1] - float* oldRotationMatrix; // [dimDataset, dimRotDataset] - float* oldClusterRotCenters; // [numClusters, dimRotDataset] + float* oldClusterCenters; // [numClusters, dimDatasetExt] + float* oldPqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* oldPqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* oldOriginalNumbers; // [numDataset] + uint32_t* old_cluster_offsets; // [numClusters + 1] + float* oldRotationMatrix; // [dimDataset, dimRotDataset] + float* oldClusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(oldDesc, &oldHeader, &oldClusterCenters, &oldPqCenters, &oldPqDataset, &oldOriginalNumbers, - &oldIndexPtr, + &old_cluster_offsets, &oldRotationMatrix, &oldClusterRotCenters); @@ -4398,29 +4404,30 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( #endif // - // Make indexPtr, originalNumbers + // Make cluster_offsets, originalNumbers // uint32_t maxClusterSize = 0; - uint32_t* indexPtr; // [numClusters + 1] + uint32_t* cluster_offsets; // [numClusters + 1] uint32_t* originalNumbers; // [numNewVectors] - indexPtr = (uint32_t*)malloc(sizeof(uint32_t) * (oldDesc->numClusters + 1)); + cluster_offsets = (uint32_t*)malloc(sizeof(uint32_t) * 
(oldDesc->numClusters + 1)); originalNumbers = (uint32_t*)malloc(sizeof(uint32_t) * numNewVectors); - // indexPtr - indexPtr[0] = 0; + // cluster_offsets + cluster_offsets[0] = 0; for (uint32_t l = 0; l < oldDesc->numClusters; l++) { - indexPtr[l + 1] = indexPtr[l] + clusterSize[l]; - maxClusterSize = max(maxClusterSize, clusterSize[l]); + cluster_offsets[l + 1] = cluster_offsets[l] + clusterSize[l]; + maxClusterSize = max(maxClusterSize, clusterSize[l]); } - RAFT_EXPECTS(indexPtr[oldDesc->numClusters] == numNewVectors, "cluster sizes do not add up."); + RAFT_EXPECTS(cluster_offsets[oldDesc->numClusters] == numNewVectors, + "cluster sizes do not add up."); // originalNumbers for (uint32_t i = 0; i < numNewVectors; i++) { - uint32_t l = newVectorLabels[i]; - originalNumbers[indexPtr[l]] = i; - indexPtr[l] += 1; + uint32_t l = newVectorLabels[i]; + originalNumbers[cluster_offsets[l]] = i; + cluster_offsets[l] += 1; } - // Recover indexPtr + // Recover cluster_offsets for (uint32_t l = 0; l < oldDesc->numClusters; l++) { - indexPtr[l] -= clusterSize[l]; + cluster_offsets[l] -= clusterSize[l]; } // @@ -4444,7 +4451,7 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( newVectors, originalNumbers, clusterSize, - indexPtr, + cluster_offsets, oldPqCenters, 0, pqDataset); @@ -4456,7 +4463,9 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( auto newDesc = cuannIvfPqCreateDescriptor(); memcpy(newDesc.get(), oldDesc.get(), sizeof(struct cuannIvfPqDescriptor)); newDesc->numDataset += numNewVectors; - newDesc->index_ptr = nullptr; + newDesc->inclusiveSumSortedClusterSize = nullptr; + newDesc->sqsumClusters = nullptr; + newDesc->index_ptr = nullptr; RAFT_LOG_DEBUG("numDataset: %u -> %u", oldDesc->numDataset, newDesc->numDataset); // @@ -4468,21 +4477,21 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( RAFT_CUDA_TRY(cudaMallocManaged(&(newDesc->index_ptr), newIndexSize)); memset(newDesc->index_ptr, 0, newIndexSize); struct cuannIvfPqIndexHeader* newHeader; - float* newClusterCenters; // [numClusters, dimDatasetExt] - float* newPqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t* newPqDataset; // [numDataset, dimPq * bitPq / 8] *** - uint32_t* newOriginalNumbers; // [numDataset] *** - uint32_t* newIndexPtr; // [numClusters + 1] *** - float* newRotationMatrix; // [dimDataset, dimRotDataset] - float* newClusterRotCenters; // [numClusters, dimRotDataset] + float* newClusterCenters; // [numClusters, dimDatasetExt] + float* newPqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* newPqDataset; // [numDataset, dimPq * bitPq / 8] *** + uint32_t* newOriginalNumbers; // [numDataset] *** + uint32_t* new_cluster_offsets; // [numClusters + 1] *** + float* newRotationMatrix; // [dimDataset, dimRotDataset] + float* newClusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(newDesc, &newHeader, &newClusterCenters, &newPqCenters, &newPqDataset, &newOriginalNumbers, - &newIndexPtr, + &new_cluster_offsets, &newRotationMatrix, &newClusterRotCenters); @@ -4503,14 +4512,14 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( newClusterRotCenters, oldClusterRotCenters, _cuann_getIndexSize_clusterRotCenters(oldDesc)); // - // Make newIndexPtr + // Make new_cluster_offsets // - maxClusterSize = 0; - newIndexPtr[0] = 0; + maxClusterSize = 0; + new_cluster_offsets[0] = 0; for (uint32_t l = 0; l < newDesc->numClusters; l++) { - uint32_t oldClusterSize = oldIndexPtr[l + 1] - oldIndexPtr[l]; - 
newIndexPtr[l + 1] = newIndexPtr[l]; - newIndexPtr[l + 1] += oldClusterSize + clusterSize[l]; + uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; + new_cluster_offsets[l + 1] = new_cluster_offsets[l]; + new_cluster_offsets[l + 1] += oldClusterSize + clusterSize[l]; maxClusterSize = max(maxClusterSize, oldClusterSize + clusterSize[l]); } { @@ -4526,12 +4535,12 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( originalNumbers[i] += oldDesc->numDataset; } for (uint32_t l = 0; l < newDesc->numClusters; l++) { - uint32_t oldClusterSize = oldIndexPtr[l + 1] - oldIndexPtr[l]; - memcpy(newOriginalNumbers + newIndexPtr[l], - oldOriginalNumbers + oldIndexPtr[l], + uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; + memcpy(newOriginalNumbers + new_cluster_offsets[l], + oldOriginalNumbers + old_cluster_offsets[l], sizeof(uint32_t) * oldClusterSize); - memcpy(newOriginalNumbers + newIndexPtr[l] + oldClusterSize, - originalNumbers + indexPtr[l], + memcpy(newOriginalNumbers + new_cluster_offsets[l] + oldClusterSize, + originalNumbers + cluster_offsets[l], sizeof(uint32_t) * clusterSize[l]); } @@ -4540,19 +4549,22 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( // size_t unitPqDataset = newDesc->dimPq * newDesc->bitPq / 8; for (uint32_t l = 0; l < newDesc->numClusters; l++) { - uint32_t oldClusterSize = oldIndexPtr[l + 1] - oldIndexPtr[l]; - memcpy(newPqDataset + unitPqDataset * newIndexPtr[l], - oldPqDataset + unitPqDataset * oldIndexPtr[l], + uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; + memcpy(newPqDataset + unitPqDataset * new_cluster_offsets[l], + oldPqDataset + unitPqDataset * old_cluster_offsets[l], sizeof(uint8_t) * unitPqDataset * oldClusterSize); - memcpy(newPqDataset + unitPqDataset * (newIndexPtr[l] + oldClusterSize), - pqDataset + unitPqDataset * indexPtr[l], + memcpy(newPqDataset + unitPqDataset * (new_cluster_offsets[l] + oldClusterSize), + pqDataset + unitPqDataset * cluster_offsets[l], sizeof(uint8_t) * unitPqDataset * clusterSize[l]); } + _cuann_get_inclusiveSumSortedClusterSize( + newDesc, new_cluster_offsets, newClusterCenters, &(newDesc->inclusiveSumSortedClusterSize)); + // - // Save new index + // Done // - if (newHeader->numDatasetAdded * 2 >= newHeader->numDataset) { + if (newHeader->numDatasetAdded * 2 > newHeader->numDataset) { RAFT_LOG_INFO( "The total number of vectors in the new index" " is now more than twice the initial number of vectors." 
@@ -4563,7 +4575,7 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( } free(originalNumbers); - free(indexPtr); + free(cluster_offsets); RAFT_CUDA_TRY(cudaFree(pqDataset)); RAFT_CUDA_TRY(cudaFree(clusterSize)); @@ -4766,7 +4778,7 @@ void cuannIvfPqSearch(const handle_t& handle, // [numClusters, 1 << bitPq, lenPq] uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] uint32_t* originalNumbers; // [numDataset] - uint32_t* indexPtr; // [numClusters + 1] + uint32_t* cluster_offsets; // [numClusters + 1] float* rotationMatrix; // [dimDataset, dimRotDataset] float* clusterRotCenters; // [numClusters, dimRotDataset] _cuann_get_index_pointers(desc, @@ -4775,7 +4787,7 @@ void cuannIvfPqSearch(const handle_t& handle, &pqCenters, &pqDataset, &originalNumbers, - &indexPtr, + &cluster_offsets, &rotationMatrix, &clusterRotCenters); // @@ -4981,7 +4993,7 @@ void cuannIvfPqSearch(const handle_t& handle, pqCenters, pqDataset, originalNumbers, - indexPtr, + cluster_offsets, clusterLabelsToProbe + ((uint64_t)(desc->numProbes) * j), rotQueries + ((uint64_t)(desc->dimRotDataset) * j), neighbors + ((uint64_t)(desc->topK) * (i + j)), @@ -5816,7 +5828,7 @@ inline void ivfpq_search(const handle_t& handle, const float* pqCenters, // [dimPq, 1 << desc->bitPq, lenPq] const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] const uint32_t* originalNumbers, // [numDataset] - const uint32_t* indexPtr, // [numClusters + 1] + const uint32_t* cluster_offsets, // [numClusters + 1] const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] const float* query, // [numQueries, dimRotDataset] uint64_t* topkNeighbors, // [numQueries, topK] @@ -5896,7 +5908,7 @@ inline void ivfpq_search(const handle_t& handle, dim3 mcThreads(1024, 1, 1); // DO NOT CHANGE dim3 mcBlocks(numQueries, 1, 1); ivfpq_make_chunk_index_ptr<<>>( - desc->numProbes, numQueries, indexPtr, clusterLabelsToProbe, chunkIndexPtr, numSamples); + desc->numProbes, numQueries, cluster_offsets, clusterLabelsToProbe, chunkIndexPtr, numSamples); #ifdef CUANN_DEBUG handle.sync_stream(); #endif @@ -6091,7 +6103,7 @@ inline void ivfpq_search(const handle_t& handle, clusterCenters, pqCenters, pqDataset, - indexPtr, + cluster_offsets, clusterLabelsToProbe, chunkIndexPtr, query, @@ -6136,7 +6148,7 @@ inline void ivfpq_search(const handle_t& handle, desc->topK, desc->maxSamples, numQueries, - indexPtr, + cluster_offsets, originalNumbers, clusterLabelsToProbe, chunkIndexPtr, diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu index a4b637283b..b628a708df 100644 --- a/cpp/test/spatial/ann_ivf_pq.cu +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -157,7 +157,12 @@ class IvfPqTest : public ::testing::TestWithParam { rmm::device_uvector indices_ivf_pq_dev(queries_size, stream_); { - auto cuann_desc = ivf_pq::cuannIvfPqCreateDescriptor(); + auto size_1 = uint32_t(ps.num_db_vecs) / 2; + auto size_2 = uint32_t(ps.num_db_vecs) - size_1; + auto vecs_1 = database.data(); + auto vecs_2 = database.data() + size_t(size_1) * size_t(ps.dim); + + auto cuann_desc_1 = ivf_pq::cuannIvfPqCreateDescriptor(); // Number of kmeans clusters. 
// @@ -201,20 +206,20 @@ class IvfPqTest : public ::testing::TestWithParam { ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_SUBSPACE; // ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_CLUSTER; ivf_pq::cuannIvfPqSetIndexParameters( - cuann_desc, - n_clusters, /* Number of clusters */ - uint32_t(ps.num_db_vecs), /* Number of dataset entries */ - uint32_t(ps.dim), /* Dimension of each entry */ - dimPq, /* Dimension of each entry after product quantization */ - bitPq, /* Bit length of PQ */ + cuann_desc_1, + n_clusters, /* Number of clusters */ + size_1, /* Number of dataset entries */ + uint32_t(ps.dim), /* Dimension of each entry */ + dimPq, /* Dimension of each entry after product quantization */ + bitPq, /* Bit length of PQ */ similarity, typePqCenter); // Build index ivf_pq::cuannIvfPqBuildIndex( handle_, - cuann_desc, - database.data(), // dataset + cuann_desc_1, + vecs_1, // dataset database.data(), // ?kmeans? trainset uint32_t(ps.num_db_vecs), // size of the trainset (I guess for kmeans) numIterations, @@ -223,8 +228,11 @@ class IvfPqTest : public ::testing::TestWithParam { ); handle_.sync_stream(stream_); + auto cuann_desc_2 = ivf_pq::cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( + handle_, cuann_desc_1, vecs_2, size_2); + // set search parameters - ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc, ps.nprobe, ps.k); + ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc_2, ps.nprobe, ps.k); // Data type of LUT to be created dynamically at search time. // // The use of low-precision types reduces the amount of shared memory @@ -249,7 +257,7 @@ class IvfPqTest : public ::testing::TestWithParam { // uint32_t preferredThreadBlockSize = 0; // 0, 256, 512, or 1024 ivf_pq::cuannIvfPqSetSearchTuningParameters( - cuann_desc, internalDistanceDtype, smemLutDtype, preferredThreadBlockSize); + cuann_desc_2, internalDistanceDtype, smemLutDtype, preferredThreadBlockSize); // Maximum number of query vectors to search at the same time. uint32_t batchSize = std::min(ps.num_queries, 32768); // Maximum device memory size that may be used as workspace at search time. @@ -259,12 +267,12 @@ class IvfPqTest : public ::testing::TestWithParam { // Allocate memory for index size_t ivf_pq_search_workspace_size; ivf_pq::cuannIvfPqSearch_bufferSize( - handle_, cuann_desc, batchSize, maxSearchWorkspaceSize, &ivf_pq_search_workspace_size); + handle_, cuann_desc_2, batchSize, maxSearchWorkspaceSize, &ivf_pq_search_workspace_size); rmm::device_buffer ivf_pq_search_ws_buf(ivf_pq_search_workspace_size, stream_); // finally, search! 
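// The call below writes the top-k neighbor indices and distances for every query into the
// device buffers passed as the remaining arguments, using the workspace sized by
// cuannIvfPqSearch_bufferSize above.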
cuannIvfPqSearch(handle_, - cuann_desc, + cuann_desc_2, search_queries.data(), ps.num_queries, indices_ivf_pq_dev.data(), From 5822844f60367b434436eb673ee49736952a9d5c Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 15 Aug 2022 10:09:42 +0200 Subject: [PATCH 017/140] Replace cuann similarity with raft distance metric --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 107 +++++++++++------------- cpp/test/spatial/ann_ivf_pq.cu | 6 +- 2 files changed, 51 insertions(+), 62 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index e59f14fc31..4dda3a0b2d 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -119,12 +120,6 @@ extern __shared__ float smemArray[]; #define FP16_MAX 65504.0 -/* CUANN similarity type */ -typedef enum { - CUANN_SIMILARITY_INNER = 0, - CUANN_SIMILARITY_L2 = 1, -} cuannSimilarity_t; - /* CUANN PQ center type */ typedef enum { CUANN_PQ_CENTER_PER_SUBSPACE = 0, @@ -140,7 +135,7 @@ struct cuannIvfPqDescriptor { uint32_t dimRotDataset; uint32_t dimPq; uint32_t bitPq; - cuannSimilarity_t similarity; + distance::DistanceType metric; cuannPqCenter_t typePqCenter; cudaDataType_t dtypeDataset; cudaDataType_t internalDistanceDtype; @@ -207,7 +202,7 @@ struct cuannIvfPqIndexHeader { uint32_t numDataset; uint32_t dimDataset; uint32_t dimPq; - uint32_t similarity; + uint32_t metric; uint32_t maxClusterSize; uint32_t dimRotDataset; uint32_t bitPq; @@ -597,7 +592,7 @@ inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCen cudaDataType_t dtype, uint32_t numDataset, uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, + distance::DistanceType metric, uint32_t* clusterSize, // [numCenters] float* accumulatedCenters) { @@ -621,7 +616,7 @@ inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCen centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault)); } - if (similarity == CUANN_SIMILARITY_INNER) { + if (metric == distance::DistanceType::InnerProduct) { // normalize _cuann_normalize(numCenters, dimCenters, centers, clusterSize); } else { @@ -672,7 +667,7 @@ inline void _cuann_kmeans_predict(const handle_t& handle, cudaDataType_t dtype, uint32_t numDataset, uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, + distance::DistanceType metric, bool isCenterSet, void* _workspace, float* tempCenters, // [numCenters, dimCenters] @@ -693,7 +688,7 @@ inline void _cuann_kmeans_predict(const handle_t& handle, dtype, numDataset, labels, - similarity, + metric, clusterSize, nullptr); } @@ -736,8 +731,6 @@ inline void _cuann_kmeans_predict(const handle_t& handle, RAFT_LOG_DEBUG("_cuann_kmeans_predict: using pool memory resource with initial size %zu bytes", pool_guard->pool_size()); } - auto metric = similarity == CUANN_SIMILARITY_INNER ? 
raft::distance::DistanceType::InnerProduct - : raft::distance::DistanceType::L2Expanded; for (uint64_t is = 0; is < numDataset; is += chunk) { uint64_t ie = min(is + chunk, (uint64_t)numDataset); @@ -818,7 +811,7 @@ inline void _cuann_kmeans_predict(const handle_t& handle, dtype, numDataset, labels, - similarity, + metric, clusterSize, tempCenters); } @@ -837,7 +830,7 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, cudaDataType_t dtype, uint32_t numDataset, uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, + distance::DistanceType metric, bool isCenterSet, uint32_t* clusterSize, // [numCenters] bool updateCenter // If true, cluster Centers will be updated. @@ -891,7 +884,7 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, dtype, nDataset, labels + d0, - similarity, + metric, isCenterSet, predictWorkspaceMP[devId], clusterCentersMP[devId], @@ -924,7 +917,7 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, dtype, numDataset, labels, - similarity, + metric, clusterSize, clusterCentersMP[orgDevId]); } @@ -945,7 +938,7 @@ inline void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenter cudaDataType_t dtype, uint32_t numDataset, uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity) + distance::DistanceType metric) { float multiplier = 1.0; if (dtype == CUDA_R_8U) { @@ -970,7 +963,7 @@ inline void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenter for (uint32_t l = 0; l < numCenters; l++) { float score = 0.0; for (uint32_t j = 0; j < dimCenters; j++) { - if (similarity == CUANN_SIMILARITY_INNER) { + if (metric == distance::DistanceType::InnerProduct) { score -= vector[j] * centers[j + (dimCenters * l)]; } else { float diff = vector[j] - centers[j + (dimCenters * l)]; @@ -996,7 +989,7 @@ __global__ void kern_adjust_centers(float* centers, // [numCenters, dimCenters] const void* _dataset, // [numDataet, dimCenters] uint32_t numDataset, const uint32_t* labels, // [numDataset] - cuannSimilarity_t similarity, + distance::DistanceType metric, const uint32_t* clusterSize, // [numCenters] float threshold, uint32_t average, @@ -1027,7 +1020,7 @@ __global__ void kern_adjust_centers(float* centers, // [numCenters, dimCenters] sqsum += val * val; centers[j + (uint64_t)dimCenters * l] = val; } - if (similarity == CUANN_SIMILARITY_INNER) { + if (metric == distance::DistanceType::InnerProduct) { sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); @@ -2788,7 +2781,7 @@ inline void cuannIvfPqSetIndexParameters( const uint32_t dimDataset, /* Dimension of each entry */ const uint32_t dimPq, /* Dimension of each entry after product quantization */ const uint32_t bitPq, /* Bit length of PQ */ - const cuannSimilarity_t similarity, + const distance::DistanceType metric, const cuannPqCenter_t typePqCenter); inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, @@ -2797,7 +2790,7 @@ inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, uint32_t* dimDataset, uint32_t* dimPq, uint32_t* bitPq, - cuannSimilarity_t* similarity, + distance::DistanceType* metric, cuannPqCenter_t* typePqCenter); inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, @@ -3278,7 +3271,7 @@ void _cuann_compute_PQ_code(const handle_t& handle, CUDA_R_32F, numTrainset, rotVectorLabels[devId], - CUANN_SIMILARITY_L2, + raft::distance::DistanceType::L2Expanded, (iter != 0), pqPredictWorkspace[devId], 
myPqCentersTemp[devId], @@ -3339,7 +3332,7 @@ void _cuann_compute_PQ_code(const handle_t& handle, CUDA_R_32F, clusterSize[l], subVectorLabels[devId] + j * clusterSize[l], - CUANN_SIMILARITY_L2, + raft::distance::DistanceType::L2Expanded, true, pqPredictWorkspace[devId], nullptr, @@ -3383,7 +3376,7 @@ inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t& desc, const uint32_t dimDataset, const uint32_t dimPq, const uint32_t bitPq, - const cuannSimilarity_t similarity, + const distance::DistanceType metric, const cuannPqCenter_t typePqCenter) { RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); @@ -3415,7 +3408,7 @@ inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t& desc, RAFT_EXPECTS(desc->dimDatasetExt % 8 == 0, "unexpected dimDatasetExt"); desc->dimPq = dimPq; desc->bitPq = bitPq; - desc->similarity = similarity; + desc->metric = metric; desc->typePqCenter = typePqCenter; desc->dimRotDataset = dimDataset; @@ -3430,7 +3423,7 @@ inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, uint32_t* dimDataset, uint32_t* dimPq, uint32_t* bitPq, - cuannSimilarity_t* similarity, + distance::DistanceType* metric, cuannPqCenter_t* typePqCenter) { RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); @@ -3440,7 +3433,7 @@ inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, *dimDataset = desc->dimDataset; *dimPq = desc->dimPq; *bitPq = desc->bitPq; - *similarity = desc->similarity; + *metric = desc->metric; *typePqCenter = desc->typePqCenter; } @@ -3486,7 +3479,7 @@ void cuannIvfPqBuildIndex( std::is_same_v || std::is_same_v || std::is_same_v, "unsupported type"); } - if (desc->similarity == CUANN_SIMILARITY_INNER) { + if (desc->metric == distance::DistanceType::InnerProduct) { RAFT_EXPECTS(dtype == CUDA_R_32F, "Unsupported dtype (inner-product metric support float only)"); } @@ -3585,7 +3578,7 @@ void cuannIvfPqBuildIndex( dtype, numTrainset, mesoClusterLabels, - desc->similarity, + desc->metric, (iter != 0), NULL, mesoClusterCentersTemp, @@ -3602,7 +3595,7 @@ void cuannIvfPqBuildIndex( device_memory, handle.get_stream())) { iter -= 1; - if (desc->similarity == CUANN_SIMILARITY_INNER) { + if (desc->metric == distance::DistanceType::InnerProduct) { detail::utils::normalize_rows( numMesoClusters, desc->dimDataset, mesoClusterCenters, handle.get_stream()); } @@ -3707,7 +3700,7 @@ void cuannIvfPqBuildIndex( CUDA_R_32F, mesoClusterSize[i], labelsMP[devId], - desc->similarity, + desc->metric, (iter != 0), predictWorkspace[devId], clusterCentersMP[devId], @@ -3725,7 +3718,7 @@ void cuannIvfPqBuildIndex( device_memory, handle.get_stream())) { iter -= 1; - if (desc->similarity == CUANN_SIMILARITY_INNER) { + if (desc->metric == distance::DistanceType::InnerProduct) { detail::utils::normalize_rows( numFineClusters[i], desc->dimDataset, clusterCentersEach[devId], handle.get_stream()); } @@ -3778,7 +3771,7 @@ void cuannIvfPqBuildIndex( dtype, numTrainset, trainsetLabels, - desc->similarity, + desc->metric, true, clusterSize, true /* to update clusterCenters */); @@ -3793,7 +3786,7 @@ void cuannIvfPqBuildIndex( device_memory, handle.get_stream())) { iter -= (X - 1); - if (desc->similarity == CUANN_SIMILARITY_INNER) { + if (desc->metric == distance::DistanceType::InnerProduct) { detail::utils::normalize_rows( desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); } @@ -3811,7 +3804,7 @@ void cuannIvfPqBuildIndex( dtype, numTrainset, trainsetLabels, - desc->similarity, + desc->metric, (iter != 0), NULL, 
clusterCentersTemp, @@ -3828,7 +3821,7 @@ void cuannIvfPqBuildIndex( device_memory, handle.get_stream())) { iter -= 1; - if (desc->similarity == CUANN_SIMILARITY_INNER) { + if (desc->metric == distance::DistanceType::InnerProduct) { detail::utils::normalize_rows( desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); } @@ -3850,7 +3843,7 @@ void cuannIvfPqBuildIndex( dtype, desc->numDataset, datasetLabels, - desc->similarity, + desc->metric, true, clusterSize, true /* to update clusterCenters */); @@ -3944,7 +3937,7 @@ void cuannIvfPqBuildIndex( dtype, numTrainset, trainsetLabels, - desc->similarity, + desc->metric, true, NULL, false /* do not update clusterCenters */); @@ -4036,7 +4029,7 @@ void cuannIvfPqBuildIndex( CUDA_R_32F, numTrainset, subTrainsetLabels[devId], - CUANN_SIMILARITY_L2, + raft::distance::DistanceType::L2Expanded, (iter != 0), pqPredictWorkspace[devId], pqCentersTemp[devId], @@ -4126,7 +4119,7 @@ void cuannIvfPqBuildIndex( // cuannIvfPqGetIndexSize(desc, &(header->indexSize)); - header->similarity = desc->similarity; + header->metric = desc->metric; header->numClusters = desc->numClusters; header->numDataset = desc->numDataset; header->dimDataset = desc->dimDataset; @@ -4206,7 +4199,7 @@ inline void cuannIvfPqLoadIndex(const handle_t& handle, desc->numDataset = header->numDataset; desc->dimDataset = header->dimDataset; desc->dimPq = header->dimPq; - desc->similarity = (cuannSimilarity_t)header->similarity; + desc->metric = (distance::DistanceType)header->metric; desc->maxClusterSize = header->maxClusterSize; desc->dimRotDataset = header->dimRotDataset; desc->lenPq = desc->dimRotDataset / desc->dimPq; @@ -4366,7 +4359,7 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( dtype, numNewVectors, newVectorLabels, - oldDesc->similarity, + oldDesc->metric, true, clusterSize, false /* do not update clusterCenters */); @@ -4860,7 +4853,7 @@ void cuannIvfPqSearch(const handle_t& handle, uint32_t nQueries = min(desc->maxQueries, numQueries - i); float fillValue = 0.0; - if (desc->similarity == CUANN_SIMILARITY_L2) { fillValue = 1.0 / -2.0; } + if (desc->metric != raft::distance::DistanceType::InnerProduct) { fillValue = 1.0 / -2.0; } float divisor = 1.0; if (desc->dtypeDataset == CUDA_R_8U) { divisor = 256.0; @@ -4929,7 +4922,7 @@ void cuannIvfPqSearch(const handle_t& handle, float alpha; float beta; uint32_t gemmK = desc->dimDataset; - if (desc->similarity == CUANN_SIMILARITY_INNER) { + if (desc->metric == distance::DistanceType::InnerProduct) { alpha = -1.0; beta = 0.0; } else { @@ -5490,9 +5483,9 @@ __device__ inline void update_approx_global_score(uint32_t topk, // template -__device__ inline outDtype get_out_score(float score, cuannSimilarity_t similarity) +__device__ inline outDtype get_out_score(float score, distance::DistanceType metric) { - if (similarity == CUANN_SIMILARITY_INNER) { score = score / 2.0 - 1.0; } + if (metric == distance::DistanceType::InnerProduct) { score = score / 2.0 - 1.0; } if (sizeof(outDtype) == 2) { score = min(score, FP16_MAX); } return (outDtype)score; } @@ -5516,7 +5509,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( uint32_t dimPq, uint32_t sizeBatch, uint32_t maxSamples, - cuannSimilarity_t similarity, + distance::DistanceType metric, cuannPqCenter_t typePqCenter, uint32_t topk, const float* clusterCenters, // [numClusters, dimDataset,] @@ -5631,7 +5624,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( dimPq, i + iDatasetBase, pqDataset, preCompScores, 
manageLocalTopk, block_topk.kth_key()); } if (!manageLocalTopk) { - if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, similarity); } + if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } } else { uint32_t val = i; block_topk.add(score, val); @@ -5645,7 +5638,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( if (warp_id == 0) { for (int j = 0; j < depth; j++) { if (threadIdx.x + (32 * j) < topk) { - output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), similarity); + output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; } } @@ -5670,7 +5663,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( uint32_t dimPq, uint32_t sizeBatch, uint32_t maxSamples, - cuannSimilarity_t similarity, + distance::DistanceType metric, cuannPqCenter_t typePqCenter, uint32_t topk, const float* clusterCenters, // [numClusters, dimDataset,] @@ -5784,7 +5777,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); } if (!manageLocalTopk) { - if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, similarity); } + if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } } else { uint32_t val = i; block_topk.add(score, val); @@ -5801,7 +5794,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( if (warp_id == 0) { for (int j = 0; j < depth; j++) { if (threadIdx.x + (32 * j) < topk) { - output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), similarity); + output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; } } @@ -5985,7 +5978,7 @@ inline void ivfpq_search(const handle_t& handle, uint32_t, uint32_t, uint32_t, - cuannSimilarity_t, + distance::DistanceType, cuannPqCenter_t, uint32_t, const float*, @@ -6097,7 +6090,7 @@ inline void ivfpq_search(const handle_t& handle, desc->dimPq, numQueries, desc->maxSamples, - desc->similarity, + desc->metric, desc->typePqCenter, desc->topK, clusterCenters, diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu index b628a708df..17fafa2b30 100644 --- a/cpp/test/spatial/ann_ivf_pq.cu +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -198,10 +198,6 @@ class IvfPqTest : public ::testing::TestWithParam { bool randomRotation = ps.dim < 1024; // disable for large-dimensional data (CPU intensive) // Number of iterations for kmeans training. uint32_t numIterations = 25; - // metric - ivf_pq::cuannSimilarity_t similarity = - ps.metric == raft::distance::DistanceType::InnerProduct ? ivf_pq::CUANN_SIMILARITY_INNER - : ivf_pq::CUANN_SIMILARITY_L2; // Specify whether PQ codebooks are created per subspace or per cluster. 
ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_SUBSPACE; // ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_CLUSTER; @@ -212,7 +208,7 @@ class IvfPqTest : public ::testing::TestWithParam { uint32_t(ps.dim), /* Dimension of each entry */ dimPq, /* Dimension of each entry after product quantization */ bitPq, /* Bit length of PQ */ - similarity, + ps.metric, typePqCenter); // Build index From 62d32a735289082a738d280347ec71be72e1e906 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 15 Aug 2022 10:29:04 +0200 Subject: [PATCH 018/140] Use raft logging levels instead of CUANN_DEBUG macro --- cpp/include/raft/spatial/knn/ivf_pq.cuh | 50 +++++++++++++------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 4dda3a0b2d..9ae4fca131 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,6 @@ ////////////////// -#define CUANN_DEBUG - namespace raft::spatial::knn::ivf_pq { /** @@ -2071,7 +2069,7 @@ inline void _cuann_find_topk(const handle_t& handle, constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; static_assert(stateBitLen == 0 || stateBitLen == 8); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) RAFT_CUDA_TRY( cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); #endif @@ -2205,7 +2203,7 @@ inline void _cuann_find_topk(const handle_t& handle, constexpr int numThreads = NUM_THREADS; constexpr int stateBitLen = STATE_BIT_LENGTH; static_assert(stateBitLen == 0 || stateBitLen == 8); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) RAFT_CUDA_TRY( cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); #endif @@ -3047,6 +3045,7 @@ inline void _cuann_kmeans_show_centers(const float* centers, // [numCenters, di const uint32_t* centerSize, const uint32_t numShow = 5) { +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) for (uint64_t k = 0; k < numCenters; k++) { if ((numShow <= k) && (k < numCenters - numShow)) { if (k == numShow) fprintf(stderr, "...\n"); @@ -3062,6 +3061,7 @@ inline void _cuann_kmeans_show_centers(const float* centers, // [numCenters, di } fprintf(stderr, " %d\n", centerSize[k]); } +#endif } // show dataset (for debugging) @@ -3070,6 +3070,7 @@ inline void _cuann_show_dataset(const float* dataset, // [numDataset, dimDatase uint32_t dimDataset, const uint32_t numShow = 5) { +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) for (uint64_t i = 0; i < numDataset; i++) { if ((numShow <= i) && (i < numDataset - numShow)) { if (i == numShow) fprintf(stderr, "...\n"); @@ -3085,6 +3086,7 @@ inline void _cuann_show_dataset(const float* dataset, // [numDataset, dimDatase } fprintf(stderr, "\n"); } +#endif } // show pq code (for debuging) @@ -3093,6 +3095,7 @@ inline void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq uint32_t dimPq, const uint32_t numShow = 5) { +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) for (uint64_t i = 0; i < numDataset; i++) { if ((numShow <= i) && (i < numDataset - numShow)) { if (i == numShow) fprintf(stderr, "...\n"); @@ -3108,6 +3111,7 @@ inline void _cuann_show_pq_code(const uint8_t* 
pqDataset, // [numDataset, dimPq } fprintf(stderr, "\n"); } +#endif } // @@ -3848,7 +3852,7 @@ void cuannIvfPqBuildIndex( clusterSize, true /* to update clusterCenters */); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) RAFT_CUDA_TRY(cudaDeviceSynchronize()); _cuann_kmeans_show_centers(clusterCenters, desc->numClusters, desc->dimDataset, clusterSize); #endif @@ -4052,7 +4056,7 @@ void cuannIvfPqBuildIndex( pqCentersEach[devId], sizeof(float) * ((1 << desc->bitPq) * desc->lenPq), cudaMemcpyDeviceToDevice)); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) if (j == 0) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); _cuann_kmeans_show_centers( @@ -4364,8 +4368,8 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( clusterSize, false /* do not update clusterCenters */); -#ifdef CUANN_DEBUG - if (1) { +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + { const int _num_show = 10; fprintf(stderr, "# numNewVectors: %u\n", numNewVectors); fprintf(stderr, "# newVectorLabels: "); @@ -4378,7 +4382,7 @@ auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( } fprintf(stderr, "\n"); } - if (1) { + { const int _num_show = 10; fprintf(stderr, "# oldDesc->numClusters: %u\n", oldDesc->numClusters); fprintf(stderr, "# clusterSize: "); @@ -5892,7 +5896,7 @@ inline void ivfpq_search(const handle_t& handle, dim3 iksBlocks(((numQueries * desc->topK) + iksThreads.x - 1) / iksThreads.x, 1, 1); ivfpq_init_topkScores<<>>( topkScores, FLT_MAX, numQueries * desc->topK); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) handle.sync_stream(); #endif } @@ -5902,7 +5906,7 @@ inline void ivfpq_search(const handle_t& handle, dim3 mcBlocks(numQueries, 1, 1); ivfpq_make_chunk_index_ptr<<>>( desc->numProbes, numQueries, cluster_offsets, clusterLabelsToProbe, chunkIndexPtr, numSamples); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) handle.sync_stream(); #endif @@ -5915,7 +5919,7 @@ inline void ivfpq_search(const handle_t& handle, dim3 psBlocks((numQueries * desc->numProbes + psThreads.x - 1) / psThreads.x, 1, 1); ivfpq_prep_sort<<>>(numQueries * desc->numProbes, indexList); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) handle.sync_stream(); #endif @@ -5931,7 +5935,7 @@ inline void ivfpq_search(const handle_t& handle, begin_bit, end_bit, handle.get_stream()); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) handle.sync_stream(); #endif } else { @@ -6005,12 +6009,12 @@ inline void ivfpq_search(const handle_t& handle, case 4: SET_KERNEL3(4); break; default: RAFT_FAIL("ivf_pq::search(k = %u): depth value is too big (%d)", desc->topK, depth); } - RAFT_LOG_INFO("ivf_pq::search(k = %u, depth = %d, dim = %u/%u/%u)", - desc->topK, - depth, - desc->dimDataset, - desc->dimRotDataset, - desc->dimPq); + RAFT_LOG_DEBUG("ivf_pq::search(k = %u, depth = %d, dim = %u/%u/%u)", + desc->topK, + depth, + desc->dimDataset, + desc->dimRotDataset, + desc->dimPq); constexpr size_t thresholdSmem = 48 * 1024; size_t sizeSmem = sizeof(smemLutDtype) * desc->dimPq * (1 << desc->bitPq); size_t sizeSmemBaseDiff = sizeof(float) * desc->dimRotDataset; @@ -6105,7 +6109,7 @@ inline void ivfpq_search(const handle_t& handle, topkScores, (scoreDtype*)similarity, simTopkIndex); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) handle.sync_stream(); #endif @@ -6129,7 +6133,7 @@ inline void ivfpq_search(const handle_t& handle, topkSids, topkWorkspace); } -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) handle.sync_stream(); #endif @@ -6150,7 
+6154,7 @@ inline void ivfpq_search(const handle_t& handle, topkSids, topkNeighbors, topkDistances); -#ifdef CUANN_DEBUG +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) handle.sync_stream(); #endif } From 48c87025c9d539391fc214018f6c766a08cb29ba Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 15 Aug 2022 15:01:14 +0200 Subject: [PATCH 019/140] Made the API scaffold. --- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 98 + .../raft/spatial/knn/detail/ivf_pq_legacy.cuh | 6091 ++++++++++++++++ .../raft/spatial/knn/detail/ivf_pq_search.cuh | 92 + cpp/include/raft/spatial/knn/ivf_pq.cuh | 6266 +---------------- cpp/include/raft/spatial/knn/ivf_pq_types.hpp | 263 + cpp/test/spatial/ann_ivf_pq.cu | 133 +- 6 files changed, 6716 insertions(+), 6227 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh create mode 100644 cpp/include/raft/spatial/knn/ivf_pq_types.hpp diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh new file mode 100644 index 0000000000..70fa36b9f8 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../ivf_pq_types.hpp" +#include "ann_kmeans_balanced.cuh" +#include "ann_utils.cuh" +#include "ivf_pq_legacy.cuh" + +#include +#include +#include +#include +#include + +#include + +namespace raft::spatial::knn::ivf_pq::detail { + +using namespace raft::spatial::knn::detail; // NOLINT + +/** See raft::spatial::knn::ivf_pq::extend docs */ +template +inline auto extend(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index +{ + common::nvtx::range fun_scope( + "ivf_pq::extend(%zu, %u)", size_t(n_rows), orig_index.dim()); + + if (new_indices != nullptr) { + RAFT_LOG_WARN("Index input is ignored at the moment (non-null new_indices given)."); + } + + ivf_pq::index new_index( + handle, orig_index.metric(), orig_index.n_lists(), orig_index.dim(), orig_index.pq_dim()); + new_index.desc() = ivf_pq::detail::cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( + handle, const_cast(orig_index.desc()), new_vectors, n_rows); + + return new_index; +} + +/** See raft::spatial::knn::ivf_pq::build docs */ +template +inline auto build( + const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) + -> index +{ + common::nvtx::range fun_scope( + "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); + static_assert(std::is_same_v || std::is_same_v || std::is_same_v, + "unsupported data type"); + RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); + + ivf_pq::index index(handle, params.metric, params.n_lists, dim, params.pq_dim); + + ivf_pq::detail::cuannIvfPqSetIndexParameters( + index.desc(), + index.n_lists(), /* Number of clusters */ + (uint32_t)n_rows, /* Number of dataset entries */ + index.dim(), /* Dimension of each entry */ + index.pq_dim(), /* Dimension of each entry after product quantization */ + params.pq_bits, /* Bit length of PQ */ + index.metric(), + params.codebook_kind); + + // Build index + ivf_pq::detail::cuannIvfPqBuildIndex( + handle, + index.desc(), + dataset, // dataset + dataset, // ?kmeans? trainset + uint32_t(params.add_data_on_build ? n_rows : 0), // size of the trainset (I guess for kmeans) + params.kmeans_n_iters, + params.random_rotation, + true // hierarchialClustering: always true in raft + ); + + return index; +} + +} // namespace raft::spatial::knn::ivf_pq::detail diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh new file mode 100644 index 0000000000..d46e3d3443 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh @@ -0,0 +1,6091 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
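For orientation, here is a hypothetical caller of the build/extend scaffold above (not part of the patch). It assumes the RAFT build environment and uses only the detail:: signatures and index_params fields visible in this file; fields that are not set are assumed to have sensible defaults.

#include <raft/spatial/knn/detail/ivf_pq_build.cuh>

void example_build_then_extend(const raft::handle_t& handle,
                               const float* dataset, uint32_t n_rows, uint32_t dim,
                               const float* extra_rows, uint32_t n_extra)
{
  using namespace raft::spatial::knn::ivf_pq;
  index_params params;      // remaining fields keep their defaults (assumed)
  params.n_lists = 1024;    // number of IVF clusters
  params.pq_dim  = 32;      // dimensionality after product quantization (illustrative choice)
  params.pq_bits = 8;       // bits per PQ code

  auto idx = detail::build<float, uint32_t>(handle, params, dataset, n_rows, dim);

  // new_indices must be nullptr for now; the code above logs a warning otherwise.
  auto extended = detail::extend<float, uint32_t>(handle, idx, extra_rows, nullptr, n_extra);
}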
+ */ +#pragma once + +#include "../ivf_pq_types.hpp" +#include "ann_kmeans_balanced.cuh" +#include "ann_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/////////////////// +#include +#include +#include +#include + +////////////////// + +namespace raft::spatial::knn::ivf_pq::detail { + +using namespace raft::spatial::knn::detail; // NOLINT + +/** + * + * + * + * + * + * fp_8bit + */ + +template +struct fp_8bit; + +template +__device__ __host__ fp_8bit __float2fp_8bit(const float v); +template +__device__ __host__ float __fp_8bit2float(const fp_8bit& v); + +template +struct fp_8bit { + uint8_t bitstring; + + __device__ __host__ fp_8bit(const uint8_t bs) { bitstring = bs; } + __device__ __host__ fp_8bit(const float fp) + { + bitstring = __float2fp_8bit(fp).bitstring; + } + __device__ __host__ fp_8bit& operator=(const float fp) + { + bitstring = __float2fp_8bit(fp).bitstring; + return *this; + } + + __device__ __host__ operator float() const { return __fp_8bit2float(*this); } +}; + +// Since __float_as_uint etc can not be used in host codes, +// these converters are needed for test. +union cvt_fp_32bit { + float fp; + uint32_t bs; +}; +union cvt_fp_16bit { + half fp; + uint16_t bs; +}; + +// Type converters +template +__device__ __host__ fp_8bit __float2fp_8bit(const float v) +{ + if (v < 1. / (1u << ((1u << (expBitLen - 1)) - 1))) + return fp_8bit{static_cast(0)}; + return fp_8bit{static_cast( + (cvt_fp_32bit{.fp = v}.bs + (((1u << (expBitLen - 1)) - 1) << 23) - 0x3f800000u) >> + (15 + expBitLen))}; +} + +template +__device__ __host__ float __fp_8bit2float(const fp_8bit& v) +{ + return cvt_fp_32bit{.bs = ((v.bitstring << (15 + expBitLen)) + + (0x3f800000u | (0x00400000u >> (8 - expBitLen))) - + (((1u << (expBitLen - 1)) - 1) << 23))} + .fp; +} + +/** + * + * end of fp8bit + * + */ + +using namespace cub; + +// +extern __shared__ float smemArray[]; + +#define FP16_MAX 65504.0 + +// header of index +struct cuannIvfPqIndexHeader { + // (*) DO NOT CHANGE ORDER + size_t indexSize; + uint32_t version; + uint32_t numClusters; + uint32_t numDataset; + uint32_t dimDataset; + uint32_t dimPq; + uint32_t metric; + uint32_t maxClusterSize; + uint32_t dimRotDataset; + uint32_t bitPq; + uint32_t typePqCenter; + uint32_t dtypeDataset; + uint32_t dimDatasetExt; + uint32_t numDatasetAdded; + uint32_t _dummy[256 - 15]; +}; + +// +inline char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) +{ + if (dtype == CUDA_R_32F) + sprintf(string, "float (CUDA_R_32F)"); + else if (dtype == CUDA_R_16F) + sprintf(string, "half (CUDA_R_16F)"); + else if (dtype == CUDA_R_8U) + sprintf(string, "uint8 (CUDA_R_8U)"); + else if (dtype == CUDA_R_8I) + sprintf(string, "int8 (CUDA_R_8I)"); + else + sprintf(string, "unknown"); + return string; +} + +// copy +template +__global__ void kern_copy(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D divisor) +{ + uint32_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint32_t iCol = gid % nCols; + uint32_t iRow = gid / nCols; + if (iRow >= nRows) return; + dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor; +} + +// copy +template +inline void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D divisor) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; + kern_copy<<>>(nRows, 
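A standalone host-side illustration of the fp_8bit round trip defined above (not part of the patch): the format stores an unsigned float in 8 bits with expBitLen exponent bits, and the two converters below repeat the bit manipulation from this file. It uses the same union type-punning trick as cvt_fp_32bit and needs -std=c++20 for the designated initializers.

#include <cstdint>
#include <cstdio>

union cvt { float fp; uint32_t bs; };

template <int expBitLen>
uint8_t to_fp8(float v)
{
  if (v < 1.f / float(1u << ((1u << (expBitLen - 1)) - 1))) return 0;
  return uint8_t((cvt{.fp = v}.bs + (((1u << (expBitLen - 1)) - 1) << 23) - 0x3f800000u)
                 >> (15 + expBitLen));
}

template <int expBitLen>
float from_fp8(uint8_t b)
{
  return cvt{.bs = (uint32_t(b) << (15 + expBitLen))
                   + (0x3f800000u | (0x00400000u >> (8 - expBitLen)))
                   - (((1u << (expBitLen - 1)) - 1) << 23)}.fp;
}

int main()
{
  for (float v : {0.01f, 0.5f, 1.0f, 3.14159f, 100.0f}) {
    uint8_t b = to_fp8<5>(v);
    std::printf("%10.5f -> 0x%02x -> %10.5f\n", v, b, from_fp8<5>(b));
  }
}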
nCols, src, ldSrc, dst, ldDst, divisor); +} + +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const float* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const uint32_t* src, + uint32_t ldSrc, + uint8_t* dst, + uint32_t ldDst, + uint8_t divisor); +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const uint8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); +template void _cuann_copy(uint32_t nRows, + uint32_t nCols, + const int8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float divisor); + +// copy_fill +template +__global__ void kern_copy_fill(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D fillValue, + D divisor) +{ + uint32_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint32_t iCol = gid % ldDst; + uint32_t iRow = gid / ldDst; + if (iRow >= nRows) return; + if (iCol < nCols) { + dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor; + } else { + dst[iCol + (ldDst * iRow)] = fillValue; + } +} + +// copy_fill +template +inline void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const S* src, // [nRows, ldSrc] + uint32_t ldSrc, + D* dst, // [nRows, ldDst] + uint32_t ldDst, + D fillValue, + D divisor, + cudaStream_t stream) +{ + RAFT_EXPECTS(ldSrc >= nCols, "src leading dimension must be larger than nCols"); + RAFT_EXPECTS(ldDst >= nCols, "dist leading dimension must be larger than nCols"); + uint32_t nThreads = 128; + uint32_t nBlocks = ((nRows * ldDst) + nThreads - 1) / nThreads; + kern_copy_fill + <<>>(nRows, nCols, src, ldSrc, dst, ldDst, fillValue, divisor); +} + +template void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const float* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float fillValue, + float divisor, + cudaStream_t stream); +template void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const uint8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float fillValue, + float divisor, + cudaStream_t stream); +template void _cuann_copy_fill(uint32_t nRows, + uint32_t nCols, + const int8_t* src, + uint32_t ldSrc, + float* dst, + uint32_t ldDst, + float fillValue, + float divisor, + cudaStream_t stream); + +// a -= b +__global__ void kern_a_me_b(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + uint32_t ldA, + float* b // [nCols] +) +{ + uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint64_t iCol = gid % nCols; + uint64_t iRow = gid / nCols; + if (iRow >= nRows) return; + a[iCol + (ldA * iRow)] -= b[iCol]; +} + +// a -= b +inline void _cuann_a_me_b(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + uint32_t ldA, + float* b // [nCols] +) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; + kern_a_me_b<<>>(nRows, nCols, a, ldA, b); +} + +// normalize +__global__ void kern_normalize(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples // [nRows,] +) +{ + uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x); + if (iRow >= nRows) return; + if (numSamples != NULL and numSamples[iRow] < 1) return; + + float sqsum = 0.0; + for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { + float val = a[iCol + (nCols * iRow)]; + sqsum += val * val; + } + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); + 
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); + sqsum = sqrt(sqsum); + for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { + a[iCol + (nCols * iRow)] /= sqsum; + } +} + +// normalize +inline void _cuann_normalize(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples = nullptr // [nRows,] +) +{ + dim3 threads(32, 4, 1); // DO NOT CHANGE + dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1); + kern_normalize<<>>(nRows, nCols, a, numSamples); +} + +// divide +__global__ void kern_divide(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples // [nRows,] +) +{ + uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); + uint64_t iRow = gid / nCols; + if (iRow >= nRows) return; + if (numSamples[iRow] == 0) return; + a[gid] /= numSamples[iRow]; +} + +// divide +inline void _cuann_divide(uint32_t nRows, + uint32_t nCols, + float* a, // [nRows, nCols] + const uint32_t* numSamples // [nRows,] +) +{ + dim3 threads(128, 1, 1); + dim3 blocks(((uint64_t)nRows * nCols + threads.x - 1) / threads.x, 1, 1); + kern_divide<<>>(nRows, nCols, a, numSamples); +} + +// +template +__global__ void kern_transpose_copy_3d(uint32_t num0, + uint32_t num1, + uint32_t num2, + D* dst, // [num2, ld1, ld0] + uint32_t ld0, + uint32_t ld1, + const S* src, // [...] + uint32_t stride0, + uint32_t stride1, + uint32_t stride2) +{ + uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + if (tid >= num0 * num1 * num2) return; + uint32_t i0 = tid % num0; + uint32_t i1 = (tid / num0) % num1; + uint32_t i2 = (tid / num0) / num1; + + dst[i0 + (ld0 * i1) + (ld0 * ld1 * i2)] = src[(stride0 * i0) + (stride1 * i1) + (stride2 * i2)]; +} + +// transpose_copy_3d +template +inline void _cuann_transpose_copy_3d(uint32_t num0, + uint32_t num1, + uint32_t num2, + D* dst, // [num2, ld1, ld0] + uint32_t ld0, + uint32_t ld1, + const S* src, // [...] + uint32_t stride0, + uint32_t stride1, + uint32_t stride2) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = ((num0 * num1 * num2) + nThreads - 1) / nThreads; + kern_transpose_copy_3d + <<>>(num0, num1, num2, dst, ld0, ld1, src, stride0, stride1, stride2); +} + +template void _cuann_transpose_copy_3d(uint32_t num0, + uint32_t num1, + uint32_t num2, + float* dst, + uint32_t ld0, + uint32_t ld1, + const float* src, + uint32_t stride0, + uint32_t stride1, + uint32_t stride2); + +// +template +__global__ void kern_axpy(int num, T alpha, const T* x, T* y) +{ + uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + if (tid >= num) return; + y[tid] += alpha * x[tid]; +} + +// +template +inline void _cuann_axpy(int num, T alpha, const T* x, T* y) +{ + uint32_t nThreads = 128; + uint32_t nBlocks = (num + nThreads - 1) / nThreads; + kern_axpy<<>>(num, alpha, x, y); +} + +template void _cuann_axpy(int num, float alpha, const float* x, float* y); +template void _cuann_axpy(int num, uint32_t alpha, const uint32_t* x, uint32_t* y); + +// +template +T** _cuann_multi_device_malloc(int numDevices, + size_t numArrayElements, + const char* arrayName, + bool useCudaMalloc = false // If true, cudaMalloc() used, + // otherwise, cudaMallocManaged() used. 
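kern_normalize above reduces each row's sum of squares across a 32-thread warp with an xor-shuffle butterfly (offsets 1, 2, 4, 8, 16). A small host model of that reduction pattern (not part of the patch), showing that after the five steps every lane holds the full sum:

#include <cstdio>

int main()
{
  // one value per "lane", as if each thread had computed a partial sqsum
  float lane[32];
  for (int i = 0; i < 32; ++i) lane[i] = float(i + 1);  // expected total: 528

  // xor-butterfly: at offset d, lane i exchanges with lane i^d and both add
  for (int d = 1; d < 32; d <<= 1) {
    float next[32];
    for (int i = 0; i < 32; ++i) next[i] = lane[i] + lane[i ^ d];
    for (int i = 0; i < 32; ++i) lane[i] = next[i];
  }
  std::printf("lane 0: %.0f, lane 31: %.0f\n", lane[0], lane[31]);  // both print 528
}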
+) +{ + int orgDevId; + RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); + T** arrays = (T**)malloc(sizeof(T*) * numDevices); + for (int devId = 0; devId < numDevices; devId++) { + RAFT_CUDA_TRY(cudaSetDevice(devId)); + if (useCudaMalloc) { + RAFT_CUDA_TRY(cudaMalloc(&(arrays[devId]), sizeof(T) * numArrayElements)); + } else { + RAFT_CUDA_TRY(cudaMallocManaged(&(arrays[devId]), sizeof(T) * numArrayElements)); + } + } + RAFT_CUDA_TRY(cudaSetDevice(orgDevId)); + return arrays; +} + +// multi_device_free +template +inline void _cuann_multi_device_free(T** arrays, int numDevices) +{ + for (int devId = 0; devId < numDevices; devId++) { + RAFT_CUDA_TRY(cudaFree(arrays[devId])); + } + free(arrays); +} + +template void _cuann_multi_device_free(float** arrays, int numDevices); +template void _cuann_multi_device_free(uint32_t** arrays, int numDevices); +template void _cuann_multi_device_free(uint8_t** arrays, int numDevices); + +/** + * End of utils + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * start of kmeans + */ + +// update kmeans centers +inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + distance::DistanceType metric, + uint32_t* clusterSize, // [numCenters] + float* accumulatedCenters) +{ + auto stream = rmm::cuda_stream_default; + if (accumulatedCenters == NULL) { + // accumulate + utils::memzero(centers, numCenters * dimCenters, stream); + utils::memzero(clusterSize, numCenters, stream); + if (dtype == CUDA_R_32F) { + utils::accumulate_into_selected( + numDataset, dimCenters, centers, clusterSize, (const float*)dataset, labels, stream); + } else if (dtype == CUDA_R_8U) { + utils::accumulate_into_selected( + numDataset, dimCenters, centers, clusterSize, (const uint8_t*)dataset, labels, stream); + } else if (dtype == CUDA_R_8I) { + utils::accumulate_into_selected( + numDataset, dimCenters, centers, clusterSize, (const int8_t*)dataset, labels, stream); + } + } else { + RAFT_CUDA_TRY(cudaMemcpy( + centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault)); + } + + if (metric == distance::DistanceType::InnerProduct) { + // normalize + _cuann_normalize(numCenters, dimCenters, centers, clusterSize); + } else { + // average + _cuann_divide(numCenters, dimCenters, centers, clusterSize); + } +} + +// +uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset) +{ + uint32_t chunk = (1 << 20); + if (chunk > (1 << 28) / numCenters) { + chunk = (1 << 28) / numCenters; + if (chunk > 31) { + chunk += 32; + chunk -= chunk % 64; + } else { + chunk = 64; + } + } + chunk = min(chunk, numDataset); + return chunk; +} + +// +inline size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, + uint32_t dimCenters, + uint32_t numDataset) +{ + uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); + size_t size = 0; + // float *curDataset; // [chunk, dimCenters] + size += Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters); + // void *bufDataset; // [chunk, dimCenters] + size += Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters); + // float *workspace; + size += Pow2<128>::roundUp(sizeof(float) * (numCenters + chunk + (numCenters * chunk))); + return size; +} + +// predict label of dataset +inline void _cuann_kmeans_predict(const handle_t& handle, + float* centers, // [numCenters, dimCenters] + uint32_t 
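_cuann_kmeans_predict_chunkSize above caps the chunk at 2^20 rows, shrinks it so that the numCenters-by-chunk distance matrix stays near 2^28 entries, rounds to a multiple of 64, and finally clamps to the dataset size. A host copy of the same logic with a few sample values (not part of the patch):

#include <algorithm>
#include <cstdint>
#include <cstdio>

uint32_t chunk_size(uint32_t numCenters, uint32_t numDataset)  // mirrors the function above
{
  uint32_t chunk = 1u << 20;
  if (chunk > (1u << 28) / numCenters) {
    chunk = (1u << 28) / numCenters;
    if (chunk > 31) {
      chunk += 32;
      chunk -= chunk % 64;
    } else {
      chunk = 64;
    }
  }
  return std::min(chunk, numDataset);
}

int main()
{
  std::printf("%u\n", chunk_size(1024, 10'000'000));     // 262144
  std::printf("%u\n", chunk_size(100'000, 10'000'000));  // 2688
  std::printf("%u\n", chunk_size(1024, 50'000));         // 50000 (capped by the dataset size)
}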
numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + distance::DistanceType metric, + bool isCenterSet, + void* _workspace, + float* tempCenters, // [numCenters, dimCenters] + uint32_t* clusterSize, // [numCenters,] + bool updateCenter) +{ + if (!isCenterSet) { + // If centers are not set, the labels will be determined randomly. + for (uint32_t i = 0; i < numDataset; i++) { + labels[i] = i % numCenters; + } + if (tempCenters != NULL && clusterSize != NULL) { + // update centers + _cuann_kmeans_update_centers(centers, + numCenters, + dimCenters, + dataset, + dtype, + numDataset, + labels, + metric, + clusterSize, + nullptr); + } + return; + } + + uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); + void* workspace = _workspace; + if (_workspace == NULL) { + size_t sizeWorkspace = _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset); + RAFT_CUDA_TRY(cudaMallocManaged(&workspace, sizeWorkspace)); + } + float* curDataset; // [chunk, dimCenters] + void* bufDataset; // [chunk, dimCenters] + // float* workspace_core; + curDataset = (float*)workspace; + bufDataset = + (void*)((uint8_t*)curDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters)); + // workspace_core = + // (float*)((uint8_t*)bufDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters)); + + auto stream = handle.get_stream(); + if (tempCenters != NULL && clusterSize != NULL) { + utils::memzero(tempCenters, numCenters * dimCenters, stream); + utils::memzero(clusterSize, numCenters, stream); + } + + cudaMemcpyKind kind; + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); + if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { + kind = cudaMemcpyDeviceToDevice; + } else { + kind = cudaMemcpyHostToDevice; + } + + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, numCenters * chunk); + if (pool_guard) { + RAFT_LOG_DEBUG("_cuann_kmeans_predict: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + for (uint64_t is = 0; is < numDataset; is += chunk) { + uint64_t ie = min(is + chunk, (uint64_t)numDataset); + uint32_t nDataset = ie - is; + + if (dtype == CUDA_R_32F) { + RAFT_CUDA_TRY(cudaMemcpy(bufDataset, + (float*)dataset + (is * dimCenters), + sizeof(float) * nDataset * dimCenters, + kind)); + } else if (dtype == CUDA_R_8U) { + RAFT_CUDA_TRY(cudaMemcpy(bufDataset, + (uint8_t*)dataset + (is * dimCenters), + sizeof(uint8_t) * nDataset * dimCenters, + kind)); + } else if (dtype == CUDA_R_8I) { + RAFT_CUDA_TRY(cudaMemcpy(bufDataset, + (int8_t*)dataset + (is * dimCenters), + sizeof(int8_t) * nDataset * dimCenters, + kind)); + } + + if (dtype == CUDA_R_32F) { +#if 0 + _cuann_copy(nDataset, dimCenters, + (const float*)bufDataset, dimCenters, + curDataset, dimCenters); +#else + // No need to copy when dtype is CUDA_R_32F + curDataset = (float*)bufDataset; +#endif + } else if (dtype == CUDA_R_8U) { + float divisor = 256.0; + _cuann_copy(nDataset, + dimCenters, + (const uint8_t*)bufDataset, + dimCenters, + curDataset, + dimCenters, + divisor); + } else if (dtype == CUDA_R_8I) { + float divisor = 128.0; + _cuann_copy(nDataset, + dimCenters, + (const int8_t*)bufDataset, + dimCenters, + curDataset, + dimCenters, + divisor); + } + + // predict + stream.synchronize(); + kmeans::predict_float_core(handle, + 
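In the chunk loop above, integer datasets are converted to float with a fixed divisor (256 for uint8, 128 for int8), so uint8 features land in [0, 1) and int8 features in [-1, 1); the cluster centers are always kept in that scaled float space. A tiny host illustration of the convention (not part of the patch):

#include <cstdint>
#include <cstdio>

int main()
{
  uint8_t u = 200;   // raw uint8 feature
  int8_t  s = -100;  // raw int8 feature
  float fu = float(u) / 256.0f;  //  0.78125, always in [0, 1)
  float fs = float(s) / 128.0f;  // -0.78125, always in [-1, 1)
  std::printf("uint8 %u -> %f, int8 %d -> %f\n", u, fu, s, fs);
}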
centers, + numCenters, + dimCenters, + curDataset, + nDataset, + labels + is, + metric, + stream, + device_memory); + stream.synchronize(); + + if ((tempCenters != NULL) && (clusterSize != NULL)) { + // accumulate + utils::accumulate_into_selected( + nDataset, dimCenters, tempCenters, clusterSize, curDataset, labels + is, stream); + } + } + + if ((tempCenters != NULL) && (clusterSize != NULL) && updateCenter) { + _cuann_kmeans_update_centers(centers, + numCenters, + dimCenters, + dataset, + dtype, + numDataset, + labels, + metric, + clusterSize, + tempCenters); + } + + if (_workspace == NULL) { RAFT_CUDA_TRY(cudaFree(workspace)); } +} + +// +// predict label of dataset with multiple devices +// +inline void _cuann_kmeans_predict_MP(const handle_t& handle, + float* clusterCenters, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + distance::DistanceType metric, + bool isCenterSet, + uint32_t* clusterSize, // [numCenters] + bool updateCenter // If true, cluster Centers will be updated. +) +{ + int numDevices = 1; + // [numDevices][numCenters, dimCenters] + float** clusterCentersCopy = _cuann_multi_device_malloc( + numDevices, numCenters * dimCenters, "clusterCentersCopy", true /* use cudaMalloc() */); + + // [numDevices][numCenters, dimCenters] + float** clusterCentersMP = + _cuann_multi_device_malloc(numDevices, numCenters * dimCenters, "clusterCentersMP"); + + // [numDevices][numCenters] + uint32_t** clusterSizeMP = + _cuann_multi_device_malloc(numDevices, numCenters, "clusterSizeMP"); + + // [numDevices][...] + size_t sizePredictWorkspace = + _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset); + void** predictWorkspaceMP = (void**)_cuann_multi_device_malloc( + numDevices, sizePredictWorkspace, "predictWorkspaceMP"); + + int orgDevId; + RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); +#pragma omp parallel num_threads(numDevices) + { + int devId = omp_get_thread_num(); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + RAFT_CUDA_TRY(cudaMemcpy(clusterCentersCopy[devId], + clusterCenters, + sizeof(float) * numCenters * dimCenters, + cudaMemcpyDefault)); + uint64_t d0 = (uint64_t)numDataset * (devId) / numDevices; + uint64_t d1 = (uint64_t)numDataset * (devId + 1) / numDevices; + uint64_t nDataset = d1 - d0; + void* ptrDataset; + if (dtype == CUDA_R_32F) { + ptrDataset = (void*)((float*)dataset + (uint64_t)dimCenters * d0); + } else if (dtype == CUDA_R_8U) { + ptrDataset = (void*)((uint8_t*)dataset + (uint64_t)dimCenters * d0); + } else if (dtype == CUDA_R_8I) { + ptrDataset = (void*)((int8_t*)dataset + (uint64_t)dimCenters * d0); + } + _cuann_kmeans_predict(handle, + clusterCentersCopy[devId], + numCenters, + dimCenters, + ptrDataset, + dtype, + nDataset, + labels + d0, + metric, + isCenterSet, + predictWorkspaceMP[devId], + clusterCentersMP[devId], + clusterSizeMP[devId], + false /* do not update centers */); + } + for (int devId = 0; devId < numDevices; devId++) { + // Barrier + RAFT_CUDA_TRY(cudaSetDevice(devId)); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + } + RAFT_CUDA_TRY(cudaSetDevice(orgDevId)); + auto stream = handle.get_stream(); + if (clusterSize != NULL) { + // Reduce results to main thread + utils::memzero(clusterSize, numCenters, stream); + handle.sync_stream(stream); + for (int devId = 0; devId < numDevices; devId++) { + _cuann_axpy(numCenters, 1, clusterSizeMP[devId], clusterSize); + if (devId != orgDevId) { + 
_cuann_axpy( + numCenters * dimCenters, 1, clusterCentersMP[devId], clusterCentersMP[orgDevId]); + } + } + if (updateCenter) { + _cuann_kmeans_update_centers(clusterCenters, + numCenters, + dimCenters, + dataset, + dtype, + numDataset, + labels, + metric, + clusterSize, + clusterCentersMP[orgDevId]); + } + } + + _cuann_multi_device_free(clusterCentersCopy, numDevices); + _cuann_multi_device_free(clusterCentersMP, numDevices); + _cuann_multi_device_free(clusterSizeMP, numDevices); + _cuann_multi_device_free((uint8_t**)predictWorkspaceMP, numDevices); +} + +// predict labe of dataset (naive CPU version). +// (*) available only for prediction, but not for training. +inline void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* dataset, // [numDataset, dimCenters] + cudaDataType_t dtype, + uint32_t numDataset, + uint32_t* labels, // [numDataset] + distance::DistanceType metric) +{ + float multiplier = 1.0; + if (dtype == CUDA_R_8U) { + multiplier = 1.0 / 256.0; + } else if (dtype == CUDA_R_8I) { + multiplier = 1.0 / 128.0; + } + for (uint32_t i = 0; i < numDataset; i++) { + float* vector = (float*)malloc(sizeof(float) * dimCenters); + for (uint32_t j = 0; j < dimCenters; j++) { + if (dtype == CUDA_R_32F) { + vector[j] = ((float*)dataset)[j + (dimCenters * i)]; + } else if (dtype == CUDA_R_8U) { + vector[j] = ((uint8_t*)dataset)[j + (dimCenters * i)]; + vector[j] *= multiplier; + } else if (dtype == CUDA_R_8I) { + vector[j] = ((int8_t*)dataset)[j + (dimCenters * i)]; + vector[j] *= multiplier; + } + } + float best_score; + for (uint32_t l = 0; l < numCenters; l++) { + float score = 0.0; + for (uint32_t j = 0; j < dimCenters; j++) { + if (metric == distance::DistanceType::InnerProduct) { + score -= vector[j] * centers[j + (dimCenters * l)]; + } else { + float diff = vector[j] - centers[j + (dimCenters * l)]; + score += diff * diff; + } + } + if ((l == 0) || (score < best_score)) { + labels[i] = l; + best_score = score; + } + } + free(vector); + } +} + +#define R_FACTOR 8 + +// +template +__global__ void kern_adjust_centers(float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const void* _dataset, // [numDataet, dimCenters] + uint32_t numDataset, + const uint32_t* labels, // [numDataset] + distance::DistanceType metric, + const uint32_t* clusterSize, // [numCenters] + float threshold, + uint32_t average, + uint32_t ofst, + uint32_t* count) +{ + const T* dataset = (const T*)_dataset; + float divisor = (float)_divisor; + uint32_t l = threadIdx.y + blockDim.y * blockIdx.y; + if (l >= numCenters) return; + if (clusterSize[l] > (int)(average * threshold)) return; + + uint32_t laneId = threadIdx.x % 32; + uint32_t i; + if (laneId == 0) { + do { + uint32_t old = atomicAdd(count, 1); + i = (ofst * (old + 1)) % numDataset; + } while (clusterSize[labels[i]] < average); + } + i = __shfl_sync(0xffffffff, i, 0); + uint32_t li = labels[i]; + float sqsum = 0.0; + for (uint32_t j = laneId; j < dimCenters; j += 32) { + float val = centers[j + (uint64_t)dimCenters * li] * (R_FACTOR - 1); + val += (float)(dataset[j + (uint64_t)dimCenters * i]) / divisor; + val /= R_FACTOR; + sqsum += val * val; + centers[j + (uint64_t)dimCenters * l] = val; + } + if (metric == distance::DistanceType::InnerProduct) { + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); + sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); + 
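kern_adjust_centers above repairs clusters whose size fell to threshold * average or below by blending the current center with one data point borrowed from an at-least-average-sized cluster; with R_FACTOR = 8 the per-coordinate update is center = (7 * center + sample / divisor) / 8. A scalar illustration of that blend (not part of the patch):

#include <cstdio>

int main()
{
  const float R = 8.0f;  // R_FACTOR in the kernel above
  float center = 0.20f;  // one coordinate of a too-small cluster's center
  float sample = 0.52f;  // same coordinate of a point taken from a large cluster

  // the center moves 1/8 of the way towards the borrowed sample
  center = (center * (R - 1.0f) + sample) / R;
  std::printf("adjusted center coordinate: %f\n", center);  // prints 0.240000
}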
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); + sqsum = sqrt(sqsum); + for (uint32_t j = laneId; j < dimCenters; j += 32) { + centers[j + ((uint64_t)dimCenters * l)] /= sqsum; + } + } +} + +/** + * end of kmeans + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * Start of topk + */ + +// +#define NUM_THREADS 1024 // DO NOT CHANGE +#define STATE_BIT_LENGTH 8 // 0: state not used, 8: state used +#define MAX_VEC_LENGTH 8 // 1, 2, 4 or 8 + +// +__device__ inline uint32_t convert(uint32_t x) +{ + if (x & 0x80000000) { + return x ^ 0xffffffff; + } else { + return x ^ 0x80000000; + } +} + +// +struct u32_vector { + uint1 x1; + uint2 x2; + uint4 x4; + ulonglong4 x8; +}; + +// +template +__device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i) +{ + if (vecLen == 1) { + vec.x1 = ((uint1*)(x + i))[0]; + } else if (vecLen == 2) { + vec.x2 = ((uint2*)(x + i))[0]; + } else if (vecLen == 4) { + vec.x4 = ((uint4*)(x + i))[0]; + } else if (vecLen == 8) { + vec.x8 = ((ulonglong4*)(x + i))[0]; + } +} + +// +template +__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i) +{ + uint32_t xi; + if (vecLen == 1) { + xi = convert(vec.x1.x); + } else if (vecLen == 2) { + if (i == 0) + xi = convert(vec.x2.x); + else + xi = convert(vec.x2.y); + } else if (vecLen == 4) { + if (i == 0) + xi = convert(vec.x4.x); + else if (i == 1) + xi = convert(vec.x4.y); + else if (i == 2) + xi = convert(vec.x4.z); + else + xi = convert(vec.x4.w); + } else if (vecLen == 8) { + if (i == 0) + xi = convert((uint32_t)(vec.x8.x & 0xffffffff)); + else if (i == 1) + xi = convert((uint32_t)(vec.x8.x >> 32)); + else if (i == 2) + xi = convert((uint32_t)(vec.x8.y & 0xffffffff)); + else if (i == 3) + xi = convert((uint32_t)(vec.x8.y >> 32)); + else if (i == 4) + xi = convert((uint32_t)(vec.x8.z & 0xffffffff)); + else if (i == 5) + xi = convert((uint32_t)(vec.x8.z >> 32)); + else if (i == 6) + xi = convert((uint32_t)(vec.x8.w & 0xffffffff)); + else + xi = convert((uint32_t)(vec.x8.w >> 32)); + } + return xi; +} + +// +template +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ + void kern_topk_cg_11(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch,] + const uint32_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels, // [size_batch, topk,] + uint32_t* _count // [size_batch, 5 * 1024,] + ) +{ + __shared__ uint32_t smem[2048 + 6]; + uint32_t* best_index = &(smem[2048]); + uint32_t* best_csum = &(smem[2048 + 3]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + uint32_t i_batch = blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x * gridDim.x; + uint32_t thread_id = threadIdx.x + (blockDim_x * blockIdx.x); + + const uint32_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 6) { smem[2048 + threadIdx.x] = 0; } + + uint32_t* count = _count + (5 * 1024 * i_batch); + for (int i = 
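convert(uint32_t) above is the usual monotone mapping from IEEE-754 float bit patterns to unsigned keys: flip the sign bit of non-negative values and all bits of negative values, and plain unsigned comparison then agrees with floating-point order. That is what lets _cuann_find_topk pass the float scores to these kernels as raw uint32_t and radix-select on the converted bits. A host check (not part of the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

uint32_t to_key(float f)  // same transform as convert(uint32_t) above
{
  uint32_t x;
  std::memcpy(&x, &f, sizeof(x));
  return (x & 0x80000000u) ? (x ^ 0xffffffffu) : (x ^ 0x80000000u);
}

int main()
{
  float v[] = {-2.5f, -0.0f, 0.0f, 1.0f, 3.5f};
  for (int i = 0; i + 1 < 5; ++i) {
    // keys must be non-decreasing whenever the floats are
    std::printf("%6.2f -> %08x%s\n", v[i], to_key(v[i]),
                to_key(v[i]) <= to_key(v[i + 1]) ? "  (<= next)" : "  (ORDER BROKEN)");
  }
  std::printf("%6.2f -> %08x\n", v[4], to_key(v[4]));
}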
thread_id; i < 5 * 1024; i += num_threads) { + count[i] = 0; + } + cg::sync(grid); + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". + // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (21 - 11 * j); + for (int i = threadIdx.x; i < 2048; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 2048) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 2048) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + for (int i = threadIdx.x; i < 2048; i += blockDim_x) { + if (smem[i] > 0) { atomicAdd(&(count[i + (2048 * j)]), smem[i]); } + } + cg::sync(grid); + + constexpr int n_data = 2048 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = count[i + (n_data * threadIdx.x) + (2048 * j)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + +#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] >= topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + } + + { + uint32_t j = 2; + for (int i = threadIdx.x; i < 1024; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold); // 0 <= k + if (k >= 1024) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 1024) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + for (int i = threadIdx.x; i < 1024; i += blockDim_x) { + if (smem[i] > 0) { atomicAdd(&(count[i + (2048 * j)]), smem[i]); } + } + cg::sync(grid); + + constexpr 
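The loop above is a radix-style threshold search: the 32-bit keys are processed in 11-, 11-, then 10-bit digit groups, and each pass histograms the current digit, prefix-sums the counts, and advances threshold to the largest value that still keeps count(x < threshold) under topk. A sequential host sketch of the same idea (not part of the patch; it models only the threshold search, not the per-thread state bits or the label output):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

uint32_t radix_select_threshold(const std::vector<uint32_t>& x, uint32_t topk)
{
  uint32_t threshold = 0, count_below = 0;
  const int digit_bits[3] = {11, 11, 10};  // 32 = 11 + 11 + 10, as in the kernels above
  int shift = 32;
  for (int bits : digit_bits) {
    shift -= bits;
    const uint32_t nbins = 1u << bits;
    std::vector<uint32_t> less(nbins + 1, 0);  // less[i]: #elements with current digit < i
    for (uint32_t v : x) {
      if (v < threshold) continue;             // already accounted for in count_below
      uint32_t d = (v - threshold) >> shift;
      if (d < nbins) less[d + 1]++;
    }
    for (uint32_t i = 1; i <= nbins; ++i) less[i] += less[i - 1];
    uint32_t best = 0;                         // largest digit boundary still below topk
    for (uint32_t i = nbins; i > 0; --i) {
      if (count_below + less[i] < topk) { best = i; break; }
    }
    threshold   += best << shift;
    count_below += less[best];
  }
  return threshold;  // count(x < threshold) == count_below < topk
}

int main()
{
  std::vector<uint32_t> x{7, 42, 42, 5, 1000000, 3, 99, 123456};
  uint32_t t = radix_select_threshold(x, 4);
  uint32_t below = (uint32_t)std::count_if(x.begin(), x.end(), [&](uint32_t v) { return v < t; });
  std::printf("threshold = %u, count below = %u\n", t, below);  // prints 42 and 3
}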
int n_data = 1024 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = count[i + (n_data * threadIdx.x) + (2048 * j)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + +#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] >= topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += best_index[j]; + } + + // + // Get labels that satifies "x[i] < threshold". + // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector vec; + load_u32_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[2048] < topk)) { + if (count_below + atomicAdd(&count[2048], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } +} + +// +template +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ + void kern_topk_cta_11(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch, max_len_x,] + const uint32_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels // [size_batch, topk,] + ) +{ + __shared__ uint32_t smem[2048 + 3 + 3 + 2]; + uint32_t* best_index = &(smem[2048]); + uint32_t* best_csum = &(smem[2048 + 3]); + uint32_t* count = &(smem[2048 + 6]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + uint32_t i_batch = blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x; + uint32_t thread_id = threadIdx.x; + + const uint32_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 8) { smem[2048 + threadIdx.x] = 0; } + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
+ // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (21 - 11 * j); + for (int i = threadIdx.x; i < 2048; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 2048) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 2048) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + constexpr int n_data = 2048 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = smem[i + (n_data * threadIdx.x)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + +#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] > topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + if (count_below == topk) break; + } + + { + uint32_t j = 2; + for (int i = threadIdx.x; i < 1024; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector x_vec; + load_u32_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold); // 0 <= k + if (k >= 1024) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 1024) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + constexpr int n_data = 1024 / blockDim_x; + uint32_t csum[n_data]; +#pragma unroll + for (int i = 0; i < n_data; i++) { + csum[i] = smem[i + (n_data * threadIdx.x)]; + } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + +#pragma unroll + for (int i = n_data - 1; i >= 0; i--) { + if (count_below + csum[i] > topk) continue; + uint32_t index = i + (n_data * threadIdx.x); + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[i]); + break; + } + __syncthreads(); + + count_below += best_csum[j]; + 
threshold += best_index[j]; + } + + // + // Get labels that satifies "x[i] < threshold". + // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u32_vector vec; + load_u32_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u32_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[1] < topk)) { + if (count_below + atomicAdd(&count[1], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } +} + +// +__device__ inline uint16_t convert(uint16_t x) +{ + if (x & 0x8000) { + return x ^ 0xffff; + } else { + return x ^ 0x8000; + } +} + +// +struct u16_vector { + ushort1 x1; + ushort2 x2; + ushort4 x4; + uint4 x8; +}; + +// +template +__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i) +{ + if (vecLen == 1) { + vec.x1 = ((ushort1*)(x + i))[0]; + } else if (vecLen == 2) { + vec.x2 = ((ushort2*)(x + i))[0]; + } else if (vecLen == 4) { + vec.x4 = ((ushort4*)(x + i))[0]; + } else if (vecLen == 8) { + vec.x8 = ((uint4*)(x + i))[0]; + } +} + +// +template +__device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i) +{ + uint16_t xi; + if (vecLen == 1) { + xi = convert(vec.x1.x); + } else if (vecLen == 2) { + if (i == 0) + xi = convert(vec.x2.x); + else + xi = convert(vec.x2.y); + } else if (vecLen == 4) { + if (i == 0) + xi = convert(vec.x4.x); + else if (i == 1) + xi = convert(vec.x4.y); + else if (i == 2) + xi = convert(vec.x4.z); + else + xi = convert(vec.x4.w); + } else if (vecLen == 8) { + if (i == 0) + xi = convert((uint16_t)(vec.x8.x & 0xffff)); + else if (i == 1) + xi = convert((uint16_t)(vec.x8.x >> 16)); + else if (i == 2) + xi = convert((uint16_t)(vec.x8.y & 0xffff)); + else if (i == 3) + xi = convert((uint16_t)(vec.x8.y >> 16)); + else if (i == 4) + xi = convert((uint16_t)(vec.x8.z & 0xffff)); + else if (i == 5) + xi = convert((uint16_t)(vec.x8.z >> 16)); + else if (i == 6) + xi = convert((uint16_t)(vec.x8.w & 0xffff)); + else + xi = convert((uint16_t)(vec.x8.w >> 16)); + } + return xi; +} + +// +template +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ + void kern_topk_cg_8(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch,] + const uint16_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels, // [size_batch, topk,] + uint32_t* _count // [size_batch, 5 * 1024,] + ) +{ + __shared__ uint32_t smem[256 + 4]; + uint32_t* best_index = &(smem[256]); + uint32_t* best_csum = &(smem[256 + 2]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + uint32_t i_batch = blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x * gridDim.x; + uint32_t thread_id = 
threadIdx.x + (blockDim_x * blockIdx.x); + + const uint16_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 4) { smem[256 + threadIdx.x] = 0; } + + uint32_t* count = _count + (2 * 256 * i_batch); + for (int i = thread_id; i < 2 * 256; i += num_threads) { + count[i] = 0; + } + cg::sync(grid); + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". + // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (8 - 8 * j); + for (int i = threadIdx.x; i < 256; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector x_vec; + load_u16_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u16_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 256) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 256) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + for (int i = threadIdx.x; i < 256; i += blockDim_x) { + if (smem[i] > 0) { atomicAdd(&(count[i + (256 * j)]), smem[i]); } + } + cg::sync(grid); + + uint32_t csum[1]; + csum[0] = 0; + if (threadIdx.x < 256) { csum[0] = count[threadIdx.x + (256 * j)]; } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + + if (threadIdx.x < 256) { + if (count_below + csum[0] < topk) { + uint32_t index = threadIdx.x; + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[0]); + } + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + } + + // + // Get labels that satifies "x[i] < threshold". 
+ // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector vec; + load_u16_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u16_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[256] < topk)) { + if (count_below + atomicAdd(&count[256], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } +} + +// +template +__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ + void kern_topk_cta_8(uint32_t topk, + uint32_t size_batch, + uint32_t max_len_x, + uint32_t* len_x, // [size_batch, max_len_x,] + const uint16_t* _x, // [size_batch, max_len_x,] + uint8_t* _state, // [size_batch, max_len_x / 8,] + uint32_t* _labels // [size_batch, topk,] + ) +{ + __shared__ uint32_t smem[256 + 6]; + uint32_t* best_index = &(smem[256]); + uint32_t* best_csum = &(smem[256 + 2]); + uint32_t* count = &(smem[256 + 4]); + typedef BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp_storage; + uint32_t i_batch = blockIdx.y; + if (i_batch >= size_batch) return; + + uint32_t nx; + if (len_x == NULL) { + nx = max_len_x; + } else { + nx = len_x[i_batch]; + } + + uint32_t num_threads = blockDim_x; + uint32_t thread_id = threadIdx.x; + + const uint16_t* x = _x + (max_len_x * i_batch); + uint8_t* state = NULL; + if (stateBitLen == 8) { + uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + state = _state + (numState_perThread * num_threads * i_batch); + } + uint32_t* labels = _labels + (topk * i_batch); + if (threadIdx.x < 6) { smem[256 + threadIdx.x] = 0; } + + uint32_t count_below = 0; + uint32_t threshold = 0; + + // + // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
+ // + for (int j = 0; j < 2; j += 1) { + uint32_t shift = (8 - 8 * j); + for (int i = threadIdx.x; i < 256; i += blockDim_x) { + smem[i] = 0; + } + __syncthreads(); + + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector x_vec; + load_u16_vector(x_vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + uint32_t xi = get_element_from_u16_vector(x_vec, u); + if (xi < threshold) { + if (stateBitLen == 8) { + labels[atomicAdd(&count[0], 1)] = ivu; + iState |= mask; + } + } else { + uint32_t k = (xi - threshold) >> shift; // 0 <= k + if (k >= 256) { + if (stateBitLen == 8) { iState |= mask; } + } else if (k + 1 < 256) { + atomicAdd(&(smem[k + 1]), 1); + } + } + } + } + if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } + } + __syncthreads(); + + uint32_t csum[1]; + if (threadIdx.x < 256) { csum[0] = smem[threadIdx.x]; } + BlockScanT(temp_storage).InclusiveSum(csum, csum); + + if (threadIdx.x < 256) { + if (count_below + csum[0] < topk) { + uint32_t index = threadIdx.x; + atomicMax(&(best_index[j]), index); + atomicMax(&(best_csum[j]), csum[0]); + } + } + __syncthreads(); + + count_below += best_csum[j]; + threshold += (best_index[j] << shift); + if (count_below == topk) break; + } + + // + // Get labels that satifies "x[i] < threshold". + // + int ii = 0; + for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { + uint8_t iState = 0; + if (stateBitLen == 8) { + iState = state[thread_id + (num_threads * ii)]; + if (iState == (uint8_t)0xff) continue; + } +#pragma unroll + for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { + int iv = i + (num_threads * v); + if (iv >= nx) break; + + struct u16_vector vec; + load_u16_vector(vec, x, iv); +#pragma unroll + for (int u = 0; u < vecLen; u++) { + int ivu = iv + u; + if (ivu >= nx) break; + + uint8_t mask = (uint8_t)0x1 << (v + u); + if ((stateBitLen == 8) && (iState & mask)) continue; + uint32_t xi = get_element_from_u16_vector(vec, u); + if (xi < threshold) { + labels[atomicAdd(&count[0], 1)] = ivu; + } else if ((xi == threshold) && (count_below + count[1] < topk)) { + if (count_below + atomicAdd(&count[1], 1) < topk) { + labels[atomicAdd(&count[0], 1)] = ivu; + } + } + } + } + } +} + +// +__global__ void _sort_topk_prep(uint32_t sizeBatch, + uint32_t topK, + uint32_t maxSamples, + const uint32_t* labels, // [sizeBatch, topK] + const float* samples, // [sizeBatch, maxSamples] + int* offsets, // [sizeBatch + 1] + float* outputs // [sizeBatch, topK] +) +{ + uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + if (tid < sizeBatch + 1) { offsets[tid] = tid * topK; } + if (tid < sizeBatch * topK) { + uint32_t label = labels[tid]; + uint32_t iBatch = tid / topK; + float value = samples[label + (maxSamples * iBatch)]; + outputs[tid] = value; + } +} + +// +inline size_t _cuann_find_topk_bufferSize(const handle_t& handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + cudaDataType_t sampleDtype = CUDA_R_32F) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + static_assert(stateBitLen == 0 || stateBitLen == 8); + + 
size_t workspaceSize = 0; + // count + if (sampleDtype == CUDA_R_16F) { + workspaceSize += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 2 * 256); + } else { + workspaceSize += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 5 * 1024); + } + // state + if (stateBitLen == 8) { + // (*) Each thread has at least one array element for state + uint32_t numBlocks_perBatch = (getMultiProcessorCount() * 2 + sizeBatch) / sizeBatch; + + uint32_t numThreads_perBatch = numThreads * numBlocks_perBatch; + uint32_t numSample_perThread = (maxSamples + numThreads_perBatch - 1) / numThreads_perBatch; + uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; + workspaceSize += + Pow2<128>::roundUp(sizeof(uint8_t) * numState_perThread * numThreads_perBatch * sizeBatch); + } + + size_t workspaceSize2 = 0; + // offsets + workspaceSize2 += Pow2<128>::roundUp(sizeof(int) * (sizeBatch + 1)); + // keys_in, keys_out, values_out + workspaceSize2 += Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK); + workspaceSize2 += Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK); + workspaceSize2 += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * topK); + // cub_ws + size_t cub_ws_size = 0; + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + cub_ws_size, + (float*)NULL, + (float*)NULL, + (uint32_t*)NULL, + (uint32_t*)NULL, + sizeBatch * topK, + sizeBatch, + (int*)NULL, + (int*)NULL); + workspaceSize2 += Pow2<128>::roundUp(cub_ws_size); + workspaceSize = max(workspaceSize, workspaceSize2); + + return workspaceSize; +} + +// +int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH) +{ + int vecLen = min(maxVecLen, MAX_VEC_LENGTH); + while ((maxSamples % vecLen) != 0) { + vecLen /= 2; + } + return vecLen; +} + +// +inline void _cuann_find_topk(const handle_t& handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + uint32_t* numSamples, // [sizeBatch,] + const float* samples, // [sizeBatch, maxSamples,] + uint32_t* labels, // [sizeBatch, topK,] + void* workspace, + bool sort = false) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + static_assert(stateBitLen == 0 || stateBitLen == 8); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + RAFT_CUDA_TRY( + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); +#endif + + // Limit the maximum value of vecLen to 4. In the case of FP32, + // setting vecLen = 8 in cg_kernel causes too much register usage. + int vecLen = _get_vecLen(maxSamples, 4); + void* cg_kernel; + if (vecLen == 4) { + cg_kernel = (void*)kern_topk_cg_11; + } else if (vecLen == 2) { + cg_kernel = (void*)kern_topk_cg_11; + } else if (vecLen == 1) { + cg_kernel = (void*)kern_topk_cg_11; + } + + int numBlocksPerSm_topk; + size_t dynamicSMemSize = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm_topk, cg_kernel, numThreads, dynamicSMemSize)); + int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); + int numBlocks = + min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); + numBlocks_perBatch = max(numBlocks / sizeBatch, 1); + if (maxSamples <= numThreads * 10) { + // When number of sample is small, using multiple thread-blocks does not + // improve performance, in which case cta_kernel is used. Tentatively, + // "numThreads * 10" is used as the threshold, but this may be better + // determined by auto-tuning, etc. 
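_get_vecLen above picks the widest vectorized load (8, 4, 2, or 1 elements) that evenly divides maxSamples; the FP32 cooperative-group path additionally caps it at 4 to limit register pressure, as noted in the comment. A quick host check of the selection rule (not part of the patch):

#include <cstdio>

int get_vecLen(unsigned maxSamples, int maxVecLen = 8)  // mirrors _get_vecLen above
{
  int vecLen = maxVecLen < 8 ? maxVecLen : 8;
  while ((maxSamples % vecLen) != 0) vecLen /= 2;
  return vecLen;
}

int main()
{
  std::printf("%d\n", get_vecLen(4096));     // 8: divisible by 8
  std::printf("%d\n", get_vecLen(4100));     // 4: divisible by 4 but not by 8
  std::printf("%d\n", get_vecLen(4097));     // 1: odd sample count
  std::printf("%d\n", get_vecLen(4096, 4));  // 4: the FP32 path caps the vector width
}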
+ numBlocks_perBatch = 1; + } + uint32_t* count = (uint32_t*)workspace; + uint8_t* state = NULL; + if (stateBitLen == 8) { + state = (uint8_t*)count + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 5 * 1024); + } + + dim3 threads(numThreads, 1, 1); + dim3 blocks(numBlocks_perBatch, sizeBatch, 1); + if (numBlocks_perBatch <= 1) { + void (*cta_kernel)( + uint32_t, uint32_t, uint32_t, uint32_t*, const uint32_t*, uint8_t*, uint32_t*); + int vecLen = _get_vecLen(maxSamples); + if (vecLen == 8) { + cta_kernel = kern_topk_cta_11; + } else if (vecLen == 4) { + cta_kernel = kern_topk_cta_11; + } else if (vecLen == 2) { + cta_kernel = kern_topk_cta_11; + } else if (vecLen == 1) { + cta_kernel = kern_topk_cta_11; + } + cta_kernel<<>>( + topK, sizeBatch, maxSamples, numSamples, (const uint32_t*)samples, state, labels); + } else { + void* args[9]; + args[0] = {&(topK)}; + args[1] = {&(sizeBatch)}; + args[2] = {&(maxSamples)}; + args[3] = {&(numSamples)}; + args[4] = {&(samples)}; + args[5] = {&(state)}; + args[6] = {&(labels)}; + args[7] = {&(count)}; + args[8] = {nullptr}; + RAFT_CUDA_TRY( + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream())); + } + if (!sort) { return; } + + // offsets: [sizeBatch + 1] + // keys_in, keys_out, values_out: [sizeBatch, topK] + int* offsets = (int*)workspace; + float* keys_in = (float*)((uint8_t*)offsets + Pow2<128>::roundUp(sizeof(int) * (sizeBatch + 1))); + float* keys_out = + (float*)((uint8_t*)keys_in + Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK)); + uint32_t* values_out = + (uint32_t*)((uint8_t*)keys_out + Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK)); + void* cub_ws = + (void*)((uint8_t*)values_out + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * topK)); + + dim3 stpThreads(128, 1, 1); + dim3 stpBlocks((max(sizeBatch + 1, sizeBatch * topK) + stpThreads.x - 1) / stpThreads.x, 1, 1); + _sort_topk_prep<<>>( + sizeBatch, topK, maxSamples, labels, samples, offsets, keys_in); + + size_t cub_ws_size = 0; + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + cub_ws_size, + keys_in, + keys_out, + labels, + values_out, + sizeBatch * topK, + sizeBatch, + offsets, + offsets + 1); + + cub::DeviceSegmentedRadixSort::SortPairs(cub_ws, + cub_ws_size, + keys_in, + keys_out, + labels, + values_out, + sizeBatch * topK, + sizeBatch, + offsets, + offsets + 1, + (int)0, + (int)(sizeof(float) * 8), + handle.get_stream()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(labels, + values_out, + sizeof(uint32_t) * sizeBatch * topK, + cudaMemcpyDeviceToDevice, + handle.get_stream())); +} + +// +inline void _cuann_find_topk(const handle_t& handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t maxSamples, + uint32_t* numSamples, // [sizeBatch,] + const half* samples, // [sizeBatch, maxSamples,] + uint32_t* labels, // [sizeBatch, topK,] + void* workspace, + bool sort = false) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + static_assert(stateBitLen == 0 || stateBitLen == 8); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + RAFT_CUDA_TRY( + cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); +#endif + + int vecLen = _get_vecLen(maxSamples); + void* cg_kernel; + if (vecLen == 8) { + cg_kernel = (void*)kern_topk_cg_8; + } else if (vecLen == 4) { + cg_kernel = (void*)kern_topk_cg_8; + } else if (vecLen == 2) { + cg_kernel = (void*)kern_topk_cg_8; + } else if (vecLen == 1) { + cg_kernel = (void*)kern_topk_cg_8; + } + + int numBlocksPerSm_topk; + RAFT_CUDA_TRY( + 
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm_topk, cg_kernel, numThreads, 0)); + int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); + int numBlocks = + min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); + numBlocks_perBatch = max(numBlocks / sizeBatch, 1); + if (maxSamples <= numThreads * 10) { + // When number of sample is small, using multiple thread-blocks does not + // improve performance, in which case cta_kernel is used. Tentatively, + // "numThreads * 10" is used as the threshold, but this may be better + // determined by auto-tuning, etc. + numBlocks_perBatch = 1; + } + uint32_t* count = (uint32_t*)workspace; + uint8_t* state = NULL; + if (stateBitLen == 8) { + state = (uint8_t*)count + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 2 * 256); + } + + dim3 threads(numThreads, 1, 1); + dim3 blocks(numBlocks_perBatch, sizeBatch, 1); + if (numBlocks_perBatch <= 1) { + void (*cta_kernel)( + uint32_t, uint32_t, uint32_t, uint32_t*, const uint16_t*, uint8_t*, uint32_t*); + int vecLen = _get_vecLen(maxSamples); + if (vecLen == 8) { + cta_kernel = kern_topk_cta_8; + } else if (vecLen == 4) { + cta_kernel = kern_topk_cta_8; + } else if (vecLen == 2) { + cta_kernel = kern_topk_cta_8; + } else if (vecLen == 1) { + cta_kernel = kern_topk_cta_8; + } + cta_kernel<<>>( + topK, sizeBatch, maxSamples, numSamples, (const uint16_t*)samples, state, labels); + } else { + void* args[9]; + args[0] = {&(topK)}; + args[1] = {&(sizeBatch)}; + args[2] = {&(maxSamples)}; + args[3] = {&(numSamples)}; + args[4] = {&(samples)}; + args[5] = {&(state)}; + args[6] = {&(labels)}; + args[7] = {&(count)}; + args[8] = {nullptr}; + RAFT_CUDA_TRY( + cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream())); + } +} + +/** + * + * End of topk + * + * + * + * + * + * + * + * + * + * + * Start of ivfpq + */ + +// +inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& desc); + +// search +template +inline void ivfpq_search(const handle_t& handle, + cuannIvfPqDescriptor_t& desc, + uint32_t numQueries, + const float* clusterCenters, // [numDataset, dimDataset] + const float* pqCenters, // [dimPq, 256, lenPq] + const uint8_t* pqDataset, // [numDataset, dimPq] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* cluster_offsets, // [numClusters + 1] + const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] + const float* query, // [dimDataset] + uint64_t* topKNeighbors, // [topK] + float* topKDistances, // [topK] + void* workspace); + +inline void ivfpq_encode(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] +); + +// +bool manage_local_topk(cuannIvfPqDescriptor_t& desc); +inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads); + +// +__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] + float initValue, + uint32_t num); + +// +__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList); + +// +__global__ void ivfpq_make_chunk_index_ptr( + uint32_t numProbes, + uint32_t sizeBatch, + const uint32_t* cluster_offsets, // [numClusters + 1,] + const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] + uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] + uint32_t* numSamples // [sizeBatch,] +); + +// +template 
+__global__ void ivfpq_make_outputs(uint32_t numProbes, + uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const scoreDtype* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +); + +// +__device__ inline uint32_t warp_scan(uint32_t x) +{ + uint32_t y; + y = __shfl_up_sync(0xffffffff, x, 1); + if (threadIdx.x % 32 >= 1) x += y; + y = __shfl_up_sync(0xffffffff, x, 2); + if (threadIdx.x % 32 >= 2) x += y; + y = __shfl_up_sync(0xffffffff, x, 4); + if (threadIdx.x % 32 >= 4) x += y; + y = __shfl_up_sync(0xffffffff, x, 8); + if (threadIdx.x % 32 >= 8) x += y; + y = __shfl_up_sync(0xffffffff, x, 16); + if (threadIdx.x % 32 >= 16) x += y; + return x; +} + +// +__device__ inline uint32_t thread_block_scan(uint32_t x, uint32_t* smem) +{ + x = warp_scan(x); + __syncthreads(); + if (threadIdx.x % 32 == 31) { smem[threadIdx.x / 32] = x; } + __syncthreads(); + if (threadIdx.x < 32) { smem[threadIdx.x] = warp_scan(smem[threadIdx.x]); } + __syncthreads(); + if (threadIdx.x / 32 > 0) { x += smem[threadIdx.x / 32 - 1]; } + __syncthreads(); + return x; +} + +// +__global__ void ivfpq_make_chunk_index_ptr( + uint32_t numProbes, + uint32_t sizeBatch, + const uint32_t* cluster_offsets, // [numClusters + 1,] + const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] + uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] + uint32_t* numSamples // [sizeBatch,] +) +{ + __shared__ uint32_t smem_temp[32]; + __shared__ uint32_t smem_base[2]; + + uint32_t iBatch = blockIdx.x; + if (iBatch >= sizeBatch) return; + const uint32_t* clusterLabelsToProbe = _clusterLabelsToProbe + (numProbes * iBatch); + uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); + + // + uint32_t j_end = (numProbes + 1024 - 1) / 1024; + for (uint32_t j = 0; j < j_end; j++) { + uint32_t i = threadIdx.x + (1024 * j); + uint32_t val = 0; + if (i < numProbes) { + uint32_t l = clusterLabelsToProbe[i]; + val = cluster_offsets[l + 1] - cluster_offsets[l]; + } + val = thread_block_scan(val, smem_temp); + + if (i < numProbes) { + if (j > 0) { val += smem_base[(j - 1) & 0x1]; } + chunkIndexPtr[i] = val; + if (i == numProbes - 1) { numSamples[iBatch] = val; } + } + + if ((j < j_end - 1) && (threadIdx.x == 1023)) { smem_base[j & 0x1] = val; } + } +} + +// +__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] + float initValue, + uint32_t num) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= num) return; + topkScores[i] = initValue; +} + +// +__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= numElement) return; + indexList[i] = i; +} + +// +__device__ inline void ivfpq_get_id_dataset(uint32_t iSample, + uint32_t numProbes, + const uint32_t* clusterIndexPtr, // [numClusters + 1,] + const uint32_t* clusterLabels, // [numProbes,] + const uint32_t* chunkIndexPtr, // [numProbes,] + uint32_t& iChunk, + uint32_t& label, + uint32_t& iDataset) +{ + uint32_t minChunk = 0; + uint32_t maxChunk = numProbes - 1; + iChunk = (minChunk + maxChunk) / 2; + while (minChunk 
< maxChunk) { + if (iSample >= chunkIndexPtr[iChunk]) { + minChunk = iChunk + 1; + } else { + maxChunk = iChunk; + } + iChunk = (minChunk + maxChunk) / 2; + } + + label = clusterLabels[iChunk]; + uint32_t iSampleInChunk = iSample; + if (iChunk > 0) { iSampleInChunk -= chunkIndexPtr[iChunk - 1]; } + iDataset = iSampleInChunk + clusterIndexPtr[label]; +} + +// +template +__global__ void ivfpq_make_outputs(uint32_t numProbes, + uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const scoreDtype* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= topk) return; + uint32_t iBatch = blockIdx.y; + if (iBatch >= sizeBatch) return; + + uint32_t iSample = topkSampleIds[i + (topk * iBatch)]; + if (scoreTopkIndex == NULL) { + // 0 <= iSample < maxSamples + topkScores[i + (topk * iBatch)] = scores[iSample + (maxSamples * iBatch)]; + uint32_t iChunk; + uint32_t label; + uint32_t iDataset; + ivfpq_get_id_dataset(iSample, + numProbes, + clusterIndexPtr, + clusterLabels + (numProbes * iBatch), + chunkIndexPtr + (numProbes * iBatch), + iChunk, + label, + iDataset); + topkNeighbors[i + (topk * iBatch)] = originalNumbers[iDataset]; + } else { + // 0 <= iSample < (numProbes * topk) + topkScores[i + (topk * iBatch)] = scores[iSample + ((numProbes * topk) * iBatch)]; + uint32_t iDataset = scoreTopkIndex[iSample + ((numProbes * topk) * iBatch)]; + topkNeighbors[i + (topk * iBatch)] = originalNumbers[iDataset]; + } +} + +// +inline bool manage_local_topk(cuannIvfPqDescriptor_t& desc) +{ + int depth = (desc->topK + 31) / 32; + if (depth > 4) { return false; } + if (desc->numProbes < 16) { return false; } + if (desc->maxBatchSize * desc->numProbes < 256) { return false; } + return true; +} + +// +inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads) +{ + if (manage_local_topk(desc)) { + int topk_32 = (desc->topK + 31) / 32; + return (sizeof(float) + sizeof(uint32_t)) * (numThreads / 2) * topk_32; + } + return 0; +} + +// return workspace size +inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& desc) +{ + size_t size = 0; + // clusterLabelsOut [maxBatchSize, numProbes] + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // indexList [maxBatchSize * numProbes] + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // indexListSorted [maxBatchSize * numProbes] + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // numSamples [maxBatchSize,] + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize); + // cubWorkspace + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + uint32_t* d_keys_in = NULL; + uint32_t* d_keys_out = NULL; + uint32_t* d_values_in = NULL; + uint32_t* d_values_out = NULL; + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + desc->maxBatchSize * desc->numProbes); + desc->sizeCubWorkspace = 
Pow2<128>::roundUp(temp_storage_bytes); + size += desc->sizeCubWorkspace; + // chunkIndexPtr [maxBatchSize, numProbes] + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); + // topkSids [maxBatchSize, topk] + size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->topK); + // similarity + size_t unit_size = sizeof(float); + if (desc->internalDistanceDtype == CUDA_R_16F) { unit_size = sizeof(half); } + if (manage_local_topk(desc)) { + // [matBatchSize, numProbes, topK] + size += Pow2<128>::roundUp(unit_size * desc->maxBatchSize * desc->numProbes * desc->topK); + } else { + // [matBatchSize, maxSamples] + size += Pow2<128>::roundUp(unit_size * desc->maxBatchSize * desc->maxSamples); + } + // simTopkIndex + if (manage_local_topk(desc)) { + // [matBatchSize, numProbes, topk] + size += + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes * desc->topK); + } + // topkScores + if (manage_local_topk(desc)) { + // [maxBatchSize, topk] + size += Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK); + } + // preCompScores [multiProcessorCount, dimPq, 1 << bitPq,] + size += + Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * desc->dimPq * (1 << desc->bitPq)); + // topkWorkspace + if (manage_local_topk(desc)) { + size += _cuann_find_topk_bufferSize(handle, + desc->topK, + desc->maxBatchSize, + desc->numProbes * desc->topK, + desc->internalDistanceDtype); + } else { + size += _cuann_find_topk_bufferSize( + handle, desc->topK, desc->maxBatchSize, desc->maxSamples, desc->internalDistanceDtype); + } + return size; +} + +// +__device__ __host__ inline void ivfpq_encode_core( + uint32_t ldDataset, uint32_t dimPq, uint32_t bitPq, const uint32_t* label, uint8_t* output) +{ + for (uint32_t j = 0; j < dimPq; j++) { + uint8_t code = label[(ldDataset * j)]; + if (bitPq == 8) { + uint8_t* ptrOutput = output + j; + ptrOutput[0] = code; + } else if (bitPq == 7) { + uint8_t* ptrOutput = output + 7 * (j / 8); + if (j % 8 == 0) { + ptrOutput[0] |= code; + } else if (j % 8 == 1) { + ptrOutput[0] |= code << 7; + ptrOutput[1] |= code >> 1; + } else if (j % 8 == 2) { + ptrOutput[1] |= code << 6; + ptrOutput[2] |= code >> 2; + } else if (j % 8 == 3) { + ptrOutput[2] |= code << 5; + ptrOutput[3] |= code >> 3; + } else if (j % 8 == 4) { + ptrOutput[3] |= code << 4; + ptrOutput[4] |= code >> 4; + } else if (j % 8 == 5) { + ptrOutput[4] |= code << 3; + ptrOutput[5] |= code >> 5; + } else if (j % 8 == 6) { + ptrOutput[5] |= code << 2; + ptrOutput[6] |= code >> 6; + } else if (j % 8 == 7) { + ptrOutput[6] |= code << 1; + } + } else if (bitPq == 6) { + uint8_t* ptrOutput = output + 3 * (j / 4); + if (j % 4 == 0) { + ptrOutput[0] |= code; + } else if (j % 4 == 1) { + ptrOutput[0] |= code << 6; + ptrOutput[1] |= code >> 2; + } else if (j % 4 == 2) { + ptrOutput[1] |= code << 4; + ptrOutput[2] |= code >> 4; + } else if (j % 4 == 3) { + ptrOutput[2] |= code << 2; + } + } else if (bitPq == 5) { + uint8_t* ptrOutput = output + 5 * (j / 8); + if (j % 8 == 0) { + ptrOutput[0] |= code; + } else if (j % 8 == 1) { + ptrOutput[0] |= code << 5; + ptrOutput[1] |= code >> 3; + } else if (j % 8 == 2) { + ptrOutput[1] |= code << 2; + } else if (j % 8 == 3) { + ptrOutput[1] |= code << 7; + ptrOutput[2] |= code >> 1; + } else if (j % 8 == 4) { + ptrOutput[2] |= code << 4; + ptrOutput[3] |= code >> 4; + } else if (j % 8 == 5) { + ptrOutput[3] |= code << 1; + } else if (j % 8 == 6) { + ptrOutput[3] |= code << 6; + ptrOutput[4] |= code >> 2; + } else if 
(j % 8 == 7) { + ptrOutput[4] |= code << 3; + } + } else if (bitPq == 4) { + uint8_t* ptrOutput = output + (j / 2); + if (j % 2 == 0) { + ptrOutput[0] |= code; + } else { + ptrOutput[0] |= code << 4; + } + } + } +} + +// +__global__ void ivfpq_encode_kernel(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] +) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= numDataset) return; + ivfpq_encode_core(ldDataset, dimPq, bitPq, label + i, output + (dimPq * bitPq / 8) * i); +} + +// +inline void ivfpq_encode(uint32_t numDataset, + uint32_t ldDataset, // (*) ldDataset >= numDataset + uint32_t dimPq, + uint32_t bitPq, // 4 <= bitPq <= 8 + const uint32_t* label, // [dimPq, ldDataset] + uint8_t* output // [numDataset, dimPq] +) +{ +#if 1 + // GPU + dim3 iekThreads(128, 1, 1); + dim3 iekBlocks((numDataset + iekThreads.x - 1) / iekThreads.x, 1, 1); + ivfpq_encode_kernel<<>>( + numDataset, ldDataset, dimPq, bitPq, label, output); +#else + // CPU + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + for (uint32_t i = 0; i < numDataset; i++) { + ivfpq_encode_core(ldDataset, dimPq, bitPq, label + i, output + (dimPq * bitPq / 8) * i); + } +#endif +} + +// +template __global__ void ivfpq_make_outputs( + uint32_t numProbes, + uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const float* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +); + +// +template __global__ void ivfpq_make_outputs( + uint32_t numProbes, + uint32_t topk, + uint32_t maxSamples, + uint32_t sizeBatch, + const uint32_t* clusterIndexPtr, // [numClusters + 1] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterLabels, // [sizeBatch, numProbes] + const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] + const half* scores, // [sizeBatch, maxSamples] or + // [sizeBatch, numProbes, topk] + const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] + const uint32_t* topkSampleIds, // [sizeBatch, topk] + uint64_t* topkNeighbors, // [sizeBatch, topk] + float* topkScores // [sizeBatch, topk] +); + +/** + * End of ivfpq + * + * + * + * + */ + +inline void cuannIvfPqSetIndexParameters( + cuannIvfPqDescriptor_t& desc, + const uint32_t numClusters, /* Number of clusters */ + const uint32_t numDataset, /* Number of dataset entries */ + const uint32_t dimDataset, /* Dimension of each entry */ + const uint32_t dimPq, /* Dimension of each entry after product quantization */ + const uint32_t bitPq, /* Bit length of PQ */ + const distance::DistanceType metric, + const codebook_gen typePqCenter); + +inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + uint32_t* bitPq, + distance::DistanceType* metric, + codebook_gen* typePqCenter); + +inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, + size_t* size /* bytes of dataset index */); + +inline size_t 
_cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t& desc) +{ + // [numClusters, dimDatasetExt] + return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimDatasetExt); +} + +inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t& desc) +{ + size_t size_base = sizeof(float) * (1 << desc->bitPq) * desc->lenPq; + if (desc->typePqCenter == codebook_gen::PER_SUBSPACE) { + // [dimPq, 1 << bitPq, lenPq] + return Pow2<128>::roundUp(desc->dimPq * size_base); + } else { + // [numClusters, 1 << bitPq, lenPq] + return Pow2<128>::roundUp(desc->numClusters * size_base); + } +} + +inline size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t& desc) +{ + // [numDataset, dimPq * bitPq / 8] + return Pow2<128>::roundUp(sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8); +} + +inline size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t& desc) +{ + // [numDataset,] + return Pow2<128>::roundUp(sizeof(uint32_t) * desc->numDataset); +} + +inline size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t& desc) +{ + // [numClusters + 1,] + return Pow2<128>::roundUp(sizeof(uint32_t) * (desc->numClusters + 1)); +} + +inline size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t& desc) +{ + // [dimDataset, dimRotDataset] + return Pow2<128>::roundUp(sizeof(float) * desc->dimDataset * desc->dimRotDataset); +} + +inline size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t& desc) +{ + // [numClusters, dimRotDataset] + return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimRotDataset); +} + +inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t& desc, + struct cuannIvfPqIndexHeader** header, + float** clusterCenters, // [numClusters, dimDatasetExt] + float** pqCenters, // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t** pqDataset, // [numDataset, dimPq * bitPq / 8] + uint32_t** originalNumbers, // [numDataset] + uint32_t** cluster_offsets, // [numClusters + 1] + float** rotationMatrix, // [dimDataset, dimRotDataset] + float** clusterRotCenters // [numClusters, dimRotDataset] +) +{ + *header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); + *clusterCenters = (float*)((uint8_t*)(*header) + sizeof(struct cuannIvfPqIndexHeader)); + *pqCenters = (float*)((uint8_t*)(*clusterCenters) + _cuann_getIndexSize_clusterCenters(desc)); + *pqDataset = (uint8_t*)((uint8_t*)(*pqCenters) + _cuann_getIndexSize_pqCenters(desc)); + *originalNumbers = (uint32_t*)((uint8_t*)(*pqDataset) + _cuann_getIndexSize_pqDataset(desc)); + *cluster_offsets = + (uint32_t*)((uint8_t*)(*originalNumbers) + _cuann_getIndexSize_originalNumbers(desc)); + *rotationMatrix = (float*)((uint8_t*)(*cluster_offsets) + _cuann_getIndexSize_indexPtr(desc)); + *clusterRotCenters = + (float*)((uint8_t*)(*rotationMatrix) + _cuann_getIndexSize_rotationMatrix(desc)); +} + +__global__ void kern_get_cluster_size(uint32_t numClusters, + const uint32_t* cluster_offsets, // [numClusters + 1,] + uint32_t* clusterSize // [numClusters,] +) +{ + uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); + if (i >= numClusters) return; + clusterSize[i] = cluster_offsets[i + 1] - cluster_offsets[i]; +} + +template +int descending(const void* a, const void* b) +{ + T valA = ((T*)a)[0]; + T valB = ((T*)b)[0]; + if (valA > valB) return -1; + if (valA < valB) return 1; + return 0; +} + +// (*) This is temporal. Need to be removed in future. 
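+// Generates a random vector of length `len` on the host and L2-normalizes it.
+// It is used below only as a stand-in centroid for clusters that ended up empty.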
+inline void _cuann_get_random_norm_vector(int len, float* vector) +{ + float sqsum = 0.0; + for (int i = 0; i < len; i++) { + vector[i] = ((float)rand() / RAND_MAX) * 2.0 - 1.0; + sqsum += vector[i] * vector[i]; + } + float norm = sqrt(sqsum); + for (int i = 0; i < len; i++) { + vector[i] /= norm; + } +} + +inline void _cuann_get_inclusiveSumSortedClusterSize( + cuannIvfPqDescriptor_t& desc, + const uint32_t* cluster_offsets, // [numClusters + 1] + float* clusterCenters, // [numClusters, dimDatasetExt] + uint32_t** output // [numClusters] +) +{ + // [CPU] + if (*output != nullptr) { free(*output); } + *output = (uint32_t*)malloc(sizeof(uint32_t) * desc->numClusters); + desc->_numClustersSize0 = 0; + for (uint32_t i = 0; i < desc->numClusters; i++) { + (*output)[i] = cluster_offsets[i + 1] - cluster_offsets[i]; + if ((*output)[i] > 0) continue; + + desc->_numClustersSize0 += 1; + // Work-around for clusters of size 0 + _cuann_get_random_norm_vector(desc->dimDatasetExt, clusterCenters + (desc->dimDatasetExt * i)); + } + RAFT_LOG_DEBUG("Number of clusters of size zero: %d", desc->_numClustersSize0); + // sort + qsort(*output, desc->numClusters, sizeof(uint32_t), descending); + // scan + for (uint32_t i = 1; i < desc->numClusters; i++) { + (*output)[i] += (*output)[i - 1]; + } + RAFT_EXPECTS((*output)[desc->numClusters - 1] == desc->numDataset, "cluster sizes do not add up"); +} + +inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t& desc, + const float* clusterCenters, // [numClusters, dimDataset,] + float** output // [numClusters,] +) +{ + if (*output != NULL) { RAFT_CUDA_TRY(cudaFree(*output)); } + RAFT_CUDA_TRY(cudaMallocManaged(output, sizeof(float) * desc->numClusters)); + switch (utils::check_pointer_residency(clusterCenters, *output)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("_cuann_get_sqsumClusters: not all pointers are available on the device."); + } + rmm::cuda_stream_default.synchronize(); + utils::dots_along_rows( + desc->numClusters, desc->dimDataset, clusterCenters, *output, rmm::cuda_stream_default); + rmm::cuda_stream_default.synchronize(); +} + +// +template +T _cuann_dot(int n, const T* x, int incX, const T* y, int incY) +{ + T val = 0; + for (int i = 0; i < n; i++) { + val += x[incX * i] * y[incY * i]; + } + return val; +} + +// +template +T _cuann_dot(int n, const X* x, int incX, const Y* y, int incY, T divisor = 1) +{ + T val = 0; + for (int i = 0; i < n; i++) { + val += (T)(x[incX * i]) * (T)(y[incY * i]) / divisor; + } + return val; +} + +// +template +T _cuann_rand() +{ + return (T)rand() / RAND_MAX; +} + +// make rotation matrix +inline void _cuann_make_rotation_matrix(uint32_t nRows, + uint32_t nCols, + uint32_t lenPq, + bool randomRotation, + float* rotationMatrix // [nRows, nCols] +) +{ + RAFT_EXPECTS( + nRows >= nCols, "number of rows (%u) must be larger than number or cols (%u)", nRows, nCols); + RAFT_EXPECTS( + nRows % lenPq == 0, "number of rows (%u) must be a multiple of lenPq (%u)", nRows, lenPq); + + if (randomRotation) { + RAFT_LOG_DEBUG("Creating a random rotation matrix."); + double dot, norm; + double* matrix = (double*)malloc(sizeof(double) * nRows * nCols); + memset(matrix, 0, sizeof(double) * nRows * nCols); + for (uint32_t i = 0; i < nRows * nCols; i++) { + matrix[i] = _cuann_rand() - 0.5; + } + for (uint32_t j = 0; j < nCols; j++) { + // normalize the j-th col vector + norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix + j, nCols)); + for (uint32_t i 
= 0; i < nRows; i++) { + matrix[j + (nCols * i)] /= norm; + } + // orthogonalize the j-th col vector with the previous col vectors + for (uint32_t k = 0; k < j; k++) { + dot = _cuann_dot(nRows, matrix + j, nCols, matrix + k, nCols); + for (uint32_t i = 0; i < nRows; i++) { + matrix[j + (nCols * i)] -= dot * matrix[k + (nCols * i)]; + } + } + // normalize the j-th col vector again + norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix + j, nCols)); + for (uint32_t i = 0; i < nRows; i++) { + matrix[j + (nCols * i)] /= norm; + } + } + for (uint32_t i = 0; i < nRows * nCols; i++) { + rotationMatrix[i] = (float)matrix[i]; + } + free(matrix); + } else { + if (nRows == nCols) { + memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); + for (uint32_t i = 0; i < nCols; i++) { + rotationMatrix[i + (nCols * i)] = 1.0; + } + } else { + memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); + uint32_t i = 0; + for (uint32_t j = 0; j < nCols; j++) { + rotationMatrix[j + (nCols * i)] = 1.0; + i += lenPq; + if (i >= nRows) { i = (i % nRows) + 1; } + } + } + } +} + +// show centers (for debuging) +inline void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenters] + uint32_t numCenters, + uint32_t dimCenters, + const uint32_t* centerSize, + const uint32_t numShow = 5) +{ +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + for (uint64_t k = 0; k < numCenters; k++) { + if ((numShow <= k) && (k < numCenters - numShow)) { + if (k == numShow) fprintf(stderr, "...\n"); + continue; + } + fprintf(stderr, "# centers[%lu]:", k); + for (uint64_t j = 0; j < dimCenters; j++) { + if ((numShow <= j) && (j < dimCenters - numShow)) { + if (j == numShow) fprintf(stderr, " ... "); + continue; + } + fprintf(stderr, " %f,", centers[j + (dimCenters * k)]); + } + fprintf(stderr, " %d\n", centerSize[k]); + } +#endif +} + +// show dataset (for debugging) +inline void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] + uint32_t numDataset, + uint32_t dimDataset, + const uint32_t numShow = 5) +{ +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + for (uint64_t i = 0; i < numDataset; i++) { + if ((numShow <= i) && (i < numDataset - numShow)) { + if (i == numShow) fprintf(stderr, "...\n"); + continue; + } + fprintf(stderr, "# dataset[%lu]:", i); + for (uint64_t j = 0; j < dimDataset; j++) { + if ((numShow <= j) && (j < dimDataset - numShow)) { + if (j == numShow) fprintf(stderr, " ... "); + continue; + } + fprintf(stderr, " %.3f,", dataset[j + (dimDataset * i)]); + } + fprintf(stderr, "\n"); + } +#endif +} + +// show pq code (for debuging) +inline void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq] + uint32_t numDataset, + uint32_t dimPq, + const uint32_t numShow = 5) +{ +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + for (uint64_t i = 0; i < numDataset; i++) { + if ((numShow <= i) && (i < numDataset - numShow)) { + if (i == numShow) fprintf(stderr, "...\n"); + continue; + } + fprintf(stderr, "# dataset[%lu]:", i); + for (uint64_t j = 0; j < dimPq; j++) { + if ((numShow <= j) && (j < dimPq - numShow)) { + if (j == numShow) fprintf(stderr, " ... 
"); + continue; + } + fprintf(stderr, " %u,", pqDataset[j + (dimPq * i)]); + } + fprintf(stderr, "\n"); + } +#endif +} + +// +int _cuann_set_device(int devId) +{ + int orgDevId; + RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + return orgDevId; +} + +// +uint32_t _get_num_trainset(uint32_t clusterSize, uint32_t dimPq, uint32_t bitPq) +{ + return min(clusterSize * dimPq, 256 * max(1 << bitPq, dimPq)); +} + +// +template +void _cuann_compute_PQ_code(const handle_t& handle, + uint32_t numDataset, + uint32_t dimDataset, + uint32_t dimRotDataset, + uint32_t dimPq, + uint32_t lenPq, + uint32_t bitPq, + uint32_t numClusters, + codebook_gen typePqCenter, + uint32_t maxClusterSize, + float* clusterCenters, // [numClusters, dimDataset] + const float* rotationMatrix, // [dimRotDataset, dimDataset] + const T* dataset, // [numDataset] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* clusterSize, // [numClusters] + const uint32_t* cluster_offsets, // [numClusters + 1] + float* pqCenters, // [...] + uint32_t numIterations, + uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] +) +{ + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); + if (pool_guard) { + RAFT_LOG_DEBUG("_cuann_compute_PQ_code: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + // + // Compute PQ code + // + memset(pqDataset, 0, sizeof(uint8_t) * numDataset * dimPq * bitPq / 8); + float** resVectors; // [numDevices][maxClusterSize, dimDataset] + float** rotVectors; // [numDevices][maxClusterSize, dimRotDataset] + float** subVectors; // [numDevices][dimPq, maxClusterSize, lenPq] + uint32_t** subVectorLabels; // [numDevices][dimPq, maxClusterSize] + uint8_t** myPqDataset; // [numDevices][maxCluserSize, dimPq * bitPq / 8] + resVectors = _cuann_multi_device_malloc(1, maxClusterSize * dimDataset, "resVectors"); + rotVectors = _cuann_multi_device_malloc(1, maxClusterSize * dimRotDataset, "rotVectors"); + subVectors = _cuann_multi_device_malloc(1, dimPq * maxClusterSize * lenPq, "subVectors"); + subVectorLabels = + _cuann_multi_device_malloc(1, dimPq * maxClusterSize, "subVectorLabels"); + myPqDataset = + _cuann_multi_device_malloc(1, maxClusterSize * dimPq * bitPq / 8, "myPqDataset"); + + uint32_t maxTrainset = 0; + if ((numIterations > 0) && (typePqCenter == codebook_gen::PER_CLUSTER)) { + maxTrainset = _get_num_trainset(maxClusterSize, dimPq, bitPq); + } + void** pqPredictWorkspace = (void**)_cuann_multi_device_malloc( + 1, + _cuann_kmeans_predict_bufferSize((1 << bitPq), lenPq, max(maxClusterSize, maxTrainset)), + "pqPredictWorkspace"); + + uint32_t** rotVectorLabels; // [numDevices][maxClusterSize, dimPq,] + uint32_t** pqClusterSize; // [numDevices][1 << bitPq,] + uint32_t** wsKAC; // [numDevices][1] + float** myPqCenters; // [numDevices][1 << bitPq, lenPq] + float** myPqCentersTemp; // [numDevices][1 << bitPq, lenPq] + if ((numIterations > 0) && (typePqCenter == codebook_gen::PER_CLUSTER)) { + memset(pqCenters, 0, sizeof(float) * numClusters * (1 << bitPq) * lenPq); + rotVectorLabels = + _cuann_multi_device_malloc(1, maxClusterSize * dimPq, "rotVectorLabels"); + pqClusterSize = _cuann_multi_device_malloc(1, (1 << bitPq), "pqClusterSize"); + wsKAC = _cuann_multi_device_malloc(1, 1, "wsKAC"); + myPqCenters = _cuann_multi_device_malloc(1, (1 << bitPq) * lenPq, "myPqCenters"); + myPqCentersTemp = _cuann_multi_device_malloc(1, (1 << bitPq) * lenPq, 
"myPqCentersTemp"); + } + +#pragma omp parallel for schedule(dynamic) num_threads(1) + for (uint32_t l = 0; l < numClusters; l++) { + int devId = omp_get_thread_num(); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + if (devId == 0) { + fprintf(stderr, "(%s) Making PQ dataset: %u / %u \r", __func__, l, numClusters); + } + if (clusterSize[l] == 0) continue; + + // + // Compute the residual vector of the new vector with its cluster + // centroids. + // resVectors[..] = newVectors[..] - clusterCenters[..] + // + utils::copy_selected(clusterSize[l], + dimDataset, + dataset, + originalNumbers + cluster_offsets[l], + dimDataset, + resVectors[devId], + dimDataset, + handle.get_stream()); + _cuann_a_me_b(clusterSize[l], + dimDataset, + resVectors[devId], + dimDataset, + clusterCenters + (uint64_t)l * dimDataset); + + // + // Rotate the residual vectors using a rotation matrix + // + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(handle, + true, + false, + dimRotDataset, + clusterSize[l], + dimDataset, + &alpha, + rotationMatrix, + dimDataset, + resVectors[devId], + dimDataset, + &beta, + rotVectors[devId], + dimRotDataset, + handle.get_stream()); + + // + // Training PQ codebook if codebook_gen::PER_CLUSTER + // (*) PQ codebooks are trained for each cluster. + // + if ((numIterations > 0) && (typePqCenter == codebook_gen::PER_CLUSTER)) { + uint32_t numTrainset = _get_num_trainset(clusterSize[l], dimPq, bitPq); + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + if (devId == 0) { + fprintf(stderr, + "(%s) Making PQ dataset: %u / %u, " + "Training PQ codebook (%u): %.1f / %u \r", + __func__, + l, + numClusters, + numTrainset, + (float)iter / 2, + numIterations); + } + _cuann_kmeans_predict(handle, + myPqCenters[devId], + (1 << bitPq), + lenPq, + rotVectors[devId], + CUDA_R_32F, + numTrainset, + rotVectorLabels[devId], + raft::distance::DistanceType::L2Expanded, + (iter != 0), + pqPredictWorkspace[devId], + myPqCentersTemp[devId], + pqClusterSize[devId], + true); + if ((iter + 1 < numIterations_2) && kmeans::adjust_centers(myPqCenters[devId], + (1 << bitPq), + lenPq, + rotVectors[devId], + numTrainset, + rotVectorLabels[devId], + pqClusterSize[devId], + (float)1.0 / 4, + device_memory, + handle.get_stream())) { + iter -= 1; + } + } + RAFT_CUDA_TRY(cudaMemcpy(pqCenters + ((1 << bitPq) * lenPq) * l, + myPqCenters[devId], + sizeof(float) * (1 << bitPq) * lenPq, + cudaMemcpyDeviceToHost)); + } + + // + // Change the order of the vector data to facilitate processing in + // each vector subspace. + // input: rotVectors[clusterSize, dimRotDataset] + // output: subVectors[dimPq, clusterSize, lenPq] + // + _cuann_transpose_copy_3d(lenPq, + clusterSize[l], + dimPq, + subVectors[devId], + lenPq, + clusterSize[l], + rotVectors[devId], + 1, + dimRotDataset, + lenPq); + + // + // Find a label (cluster ID) for each vector subspace. 
+ // + for (uint32_t j = 0; j < dimPq; j++) { + float* curPqCenters = NULL; + if (typePqCenter == codebook_gen::PER_SUBSPACE) { + curPqCenters = pqCenters + ((1 << bitPq) * lenPq) * j; + } else if (typePqCenter == codebook_gen::PER_CLUSTER) { + curPqCenters = pqCenters + ((1 << bitPq) * lenPq) * l; + if (numIterations > 0) { curPqCenters = myPqCenters[devId]; } + } + _cuann_kmeans_predict(handle, + curPqCenters, + (1 << bitPq), + lenPq, + subVectors[devId] + j * (clusterSize[l] * lenPq), + CUDA_R_32F, + clusterSize[l], + subVectorLabels[devId] + j * clusterSize[l], + raft::distance::DistanceType::L2Expanded, + true, + pqPredictWorkspace[devId], + nullptr, + nullptr, + true); + } + + // + // PQ encoding + // + ivfpq_encode( + clusterSize[l], clusterSize[l], dimPq, bitPq, subVectorLabels[devId], myPqDataset[devId]); + RAFT_CUDA_TRY(cudaMemcpy(pqDataset + ((uint64_t)cluster_offsets[l] * dimPq * bitPq / 8), + myPqDataset[devId], + sizeof(uint8_t) * clusterSize[l] * dimPq * bitPq / 8, + cudaMemcpyDeviceToHost)); + } + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + fprintf(stderr, "\n"); + + // + _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, 1); + _cuann_multi_device_free(myPqDataset, 1); + _cuann_multi_device_free(subVectorLabels, 1); + _cuann_multi_device_free(subVectors, 1); + _cuann_multi_device_free(rotVectors, 1); + _cuann_multi_device_free(resVectors, 1); + if ((numIterations > 0) && (typePqCenter == codebook_gen::PER_CLUSTER)) { + _cuann_multi_device_free(wsKAC, 1); + _cuann_multi_device_free(rotVectorLabels, 1); + _cuann_multi_device_free(pqClusterSize, 1); + _cuann_multi_device_free(myPqCenters, 1); + _cuann_multi_device_free(myPqCentersTemp, 1); + } +} + +// cuannIvfPqSetIndexParameters +inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t& desc, + const uint32_t numClusters, + const uint32_t numDataset, + const uint32_t dimDataset, + const uint32_t dimPq, + const uint32_t bitPq, + const distance::DistanceType metric, + const codebook_gen typePqCenter) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + RAFT_EXPECTS(numClusters > 0, "(%s) numClusters must be larger than zero.", __func__); + RAFT_EXPECTS(numDataset > 0, "(%s) numDataset must be larger than zero.", __func__); + RAFT_EXPECTS(dimDataset > 0, "(%s) dimDataset must be larger than zero.", __func__); + RAFT_EXPECTS(dimPq > 0, "(%s) dimPq must be larger than zero.", __func__); + RAFT_EXPECTS(numClusters <= numDataset, + "(%s) numClusters must be smaller than numDataset (numClusters:%u, numDataset:%u).", + __func__, + numClusters, + numDataset); + RAFT_EXPECTS(bitPq >= 4 && bitPq <= 8, + "(%s) bitPq must be within closed range [4,8], but got %u.", + __func__, + bitPq); + RAFT_EXPECTS((bitPq * dimPq) % 8 == 0, + "(%s) `bitPq * dimPq` must be a multiple of 8, but got %u * %u = %u.", + __func__, + bitPq, + dimPq, + bitPq * dimPq); + desc->numClusters = numClusters; + desc->numDataset = numDataset; + desc->dimDataset = dimDataset; + desc->dimDatasetExt = dimDataset + 1; + if (desc->dimDatasetExt % 8) { desc->dimDatasetExt += 8 - (desc->dimDatasetExt % 8); } + RAFT_EXPECTS(desc->dimDatasetExt >= dimDataset + 1, "unexpected dimDatasetExt"); + RAFT_EXPECTS(desc->dimDatasetExt % 8 == 0, "unexpected dimDatasetExt"); + desc->dimPq = dimPq; + desc->bitPq = bitPq; + desc->metric = metric; + desc->typePqCenter = typePqCenter; + + desc->dimRotDataset = dimDataset; + if (dimDataset % dimPq) { desc->dimRotDataset = ((dimDataset / dimPq) + 1) * dimPq; } + desc->lenPq = desc->dimRotDataset / dimPq; +} + 
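+// A minimal usage sketch of the parameter setter above (the values are illustrative
+// only, not recommendations; the descriptor is assumed to be created elsewhere):
+//
+//   cuannIvfPqDescriptor_t desc = /* created elsewhere */;
+//   cuannIvfPqSetIndexParameters(desc,
+//                                /*numClusters=*/1024,
+//                                /*numDataset=*/1000000,
+//                                /*dimDataset=*/50,
+//                                /*dimPq=*/8,
+//                                /*bitPq=*/8,
+//                                distance::DistanceType::L2Expanded,
+//                                codebook_gen::PER_SUBSPACE);
+//
+// With dimDataset = 50 and dimPq = 8 the derived shapes become:
+//   dimDatasetExt = 50 + 1, padded up to a multiple of 8       -> 56
+//   dimRotDataset = 50, rounded up to a multiple of dimPq      -> 56
+//   lenPq         = dimRotDataset / dimPq                      -> 7
+// (The element at index dimDataset of each extended cluster center later stores that
+// center's squared L2 norm; see cuannIvfPqBuildIndex.)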
+// cuannIvfPqGetIndexParameters +inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, + uint32_t* numClusters, + uint32_t* numDataset, + uint32_t* dimDataset, + uint32_t* dimPq, + uint32_t* bitPq, + distance::DistanceType* metric, + codebook_gen* typePqCenter) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + + *numClusters = desc->numClusters; + *numDataset = desc->numDataset; + *dimDataset = desc->dimDataset; + *dimPq = desc->dimPq; + *bitPq = desc->bitPq; + *metric = desc->metric; + *typePqCenter = desc->typePqCenter; +} + +// cuannIvfPqGetIndexSize +inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, size_t* size) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + + *size = sizeof(struct cuannIvfPqIndexHeader); + RAFT_EXPECTS(*size == 1024, "Critical error: unexpected header size."); + *size += _cuann_getIndexSize_clusterCenters(desc); + *size += _cuann_getIndexSize_pqCenters(desc); + *size += _cuann_getIndexSize_pqDataset(desc); + *size += _cuann_getIndexSize_originalNumbers(desc); + *size += _cuann_getIndexSize_indexPtr(desc); + *size += _cuann_getIndexSize_rotationMatrix(desc); + *size += _cuann_getIndexSize_clusterRotCenters(desc); +} + +template +void cuannIvfPqBuildIndex( + const handle_t& handle, + cuannIvfPqDescriptor_t& desc, + const T* dataset, /* [numDataset, dimDataset] */ + const T* trainset, /* [numTrainset, dimDataset] */ + uint32_t numTrainset, /* Number of train-set entries */ + uint32_t numIterations, /* Number of iterations to train kmeans */ + bool randomRotation, /* If true, rotate vectors with randamly created rotation matrix */ + bool hierarchicalClustering /* If true, do kmeans training hierarchically */) +{ + int cuannDevId = handle.get_device(); + int callerDevId = _cuann_set_device(cuannDevId); + + cudaDataType_t dtype; + if constexpr (std::is_same_v) { + dtype = CUDA_R_32F; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8U; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8I; + } else { + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v, + "unsupported type"); + } + if (desc->metric == distance::DistanceType::InnerProduct) { + RAFT_EXPECTS(dtype == CUDA_R_32F, + "Unsupported dtype (inner-product metric support float only)"); + } + + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); + if (pool_guard) { + RAFT_LOG_DEBUG("cuannIvfPqBuildIndex: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + desc->dtypeDataset = dtype; + char dtypeString[64]; + _cuann_get_dtype_string(desc->dtypeDataset, dtypeString); + RAFT_LOG_DEBUG("Dataset dtype = %s", dtypeString); + + switch (utils::check_pointer_residency(dataset, trainset)) { + case utils::pointer_residency::host_only: + case utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("both dataset and trainsed must be accessible from the host."); + } + + if (desc->index_ptr != NULL) { RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->index_ptr)); } + size_t index_size; + cuannIvfPqGetIndexSize(desc, &index_size); + RAFT_CUDA_TRY(cudaMallocManaged(&(desc->index_ptr), index_size)); + + struct cuannIvfPqIndexHeader* header; + float* clusterCenters; // [numClusters, dimDataset] + float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* originalNumbers; // [numDataset] + 
uint32_t* cluster_offsets; // [numClusters + 1] + float* rotationMatrix; // [dimDataset, dimRotDataset] + float* clusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(desc, + &header, + &clusterCenters, + &pqCenters, + &pqDataset, + &originalNumbers, + &cluster_offsets, + &rotationMatrix, + &clusterRotCenters); + + uint32_t* trainsetLabels; // [numTrainset] + RAFT_CUDA_TRY(cudaMallocManaged(&trainsetLabels, sizeof(uint32_t) * numTrainset)); + + uint32_t* clusterSize; // [numClusters] + RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * desc->numClusters)); + + float* clusterCentersTemp; // [numClusters, dimDataset] + RAFT_CUDA_TRY( + cudaMallocManaged(&clusterCentersTemp, sizeof(float) * desc->numClusters * desc->dimDataset)); + + uint32_t** wsKAC = _cuann_multi_device_malloc(1, 1, "wsKAC"); + + // + // Training kmeans + // + if (hierarchicalClustering) { + RAFT_LOG_DEBUG("Hierarchical clustering: enabled"); + } else { + RAFT_LOG_DEBUG("Hierarchical clustering: disabled"); + } + if (hierarchicalClustering) { + // Hierarchical kmeans + uint32_t numMesoClusters = pow((double)(desc->numClusters), (double)1.0 / 2.0) + 0.5; + RAFT_LOG_DEBUG("numMesoClusters: %u", numMesoClusters); + + float* mesoClusterCenters; // [numMesoClusters, dimDataset] + RAFT_CUDA_TRY( + cudaMallocManaged(&mesoClusterCenters, sizeof(float) * numMesoClusters * desc->dimDataset)); + + float* mesoClusterCentersTemp; // [numMesoClusters, dimDataset] + RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterCentersTemp, + sizeof(float) * numMesoClusters * desc->dimDataset)); + + uint32_t* mesoClusterLabels; // [numTrainset,] + RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterLabels, sizeof(uint32_t) * numTrainset)); + + uint32_t* mesoClusterSize; // [numMesoClusters,] + RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterSize, sizeof(uint32_t) * numMesoClusters)); + + // + // Training kmeans for meso-clusters + // + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + _cuann_kmeans_predict(handle, + mesoClusterCenters, + numMesoClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + mesoClusterLabels, + desc->metric, + (iter != 0), + NULL, + mesoClusterCentersTemp, + mesoClusterSize, + true); + if ((iter + 1 < numIterations_2) && kmeans::adjust_centers(mesoClusterCenters, + numMesoClusters, + desc->dimDataset, + trainset, + numTrainset, + mesoClusterLabels, + mesoClusterSize, + (float)1.0 / 4, + device_memory, + handle.get_stream())) { + iter -= 1; + if (desc->metric == distance::DistanceType::InnerProduct) { + utils::normalize_rows( + numMesoClusters, desc->dimDataset, mesoClusterCenters, handle.get_stream()); + } + } + } + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + // Number of centers in each meso cluster + // [numMesoClusters,] + uint32_t* numFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * numMesoClusters); + + // [numMesoClusters + 1,] + uint32_t* csumFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * (numMesoClusters + 1)); + csumFineClusters[0] = 0; + + uint32_t numClustersRemain = desc->numClusters; + uint32_t numTrainsetRemain = numTrainset; + uint32_t mesoClusterSizeSum = 0; // check + uint32_t mesoClusterSizeMax = 0; + uint32_t numFineClustersMax = 0; + for (uint32_t i = 0; i < numMesoClusters; i++) { + if (i < numMesoClusters - 1) { + numFineClusters[i] = + (double)numClustersRemain * mesoClusterSize[i] / numTrainsetRemain + .5; + } else { + numFineClusters[i] = numClustersRemain; + } + csumFineClusters[i + 1] = csumFineClusters[i] + 
numFineClusters[i]; + + numClustersRemain -= numFineClusters[i]; + numTrainsetRemain -= mesoClusterSize[i]; + mesoClusterSizeSum += mesoClusterSize[i]; + mesoClusterSizeMax = max(mesoClusterSizeMax, mesoClusterSize[i]); + numFineClustersMax = max(numFineClustersMax, numFineClusters[i]); + } + RAFT_EXPECTS(mesoClusterSizeSum == numTrainset, "mesocluster sizes do not add up"); + RAFT_EXPECTS(csumFineClusters[numMesoClusters] == desc->numClusters, + "fine cluster sizes do not add up"); + + uint32_t** idsTrainset = + _cuann_multi_device_malloc(1, mesoClusterSizeMax, "idsTrainset"); + + float** subTrainset = + _cuann_multi_device_malloc(1, mesoClusterSizeMax * desc->dimDataset, "subTrainset"); + + // label (cluster ID) of each vector + uint32_t** labelsMP = _cuann_multi_device_malloc(1, mesoClusterSizeMax, "labelsMP"); + + float** clusterCentersEach = _cuann_multi_device_malloc( + 1, numFineClustersMax * desc->dimDataset, "clusterCentersEach"); + + float** clusterCentersMP = _cuann_multi_device_malloc( + 1, numFineClustersMax * desc->dimDataset, "clusterCentersMP"); + + // number of vectors in each cluster + uint32_t** clusterSizeMP = + _cuann_multi_device_malloc(1, numFineClustersMax, "clusterSizeMP"); + + size_t sizePredictWorkspace = 0; + for (uint32_t i = 0; i < numMesoClusters; i++) { + sizePredictWorkspace = + max(sizePredictWorkspace, + _cuann_kmeans_predict_bufferSize(numFineClusters[i], // number of centers + desc->dimDataset, + mesoClusterSize[i] // number of vectors + )); + } + void** predictWorkspace = + (void**)_cuann_multi_device_malloc(1, sizePredictWorkspace, "predictWorkspace"); + + // + // Training kmeans for clusters in each meso-clusters + // +#pragma omp parallel for schedule(dynamic) num_threads(1) + for (uint32_t i = 0; i < numMesoClusters; i++) { + int devId = omp_get_thread_num(); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + + uint32_t k = 0; + for (uint32_t j = 0; j < numTrainset; j++) { + if (mesoClusterLabels[j] != i) continue; + idsTrainset[devId][k++] = j; + } + RAFT_EXPECTS(k == mesoClusterSize[i], "unexpected cluster size for cluster %u", i); + + utils::copy_selected(mesoClusterSize[i], + desc->dimDataset, + trainset, + idsTrainset[devId], + desc->dimDataset, + subTrainset[devId], + desc->dimDataset, + handle.get_stream()); + + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + _cuann_kmeans_predict(handle, + clusterCentersEach[devId], + numFineClusters[i], + desc->dimDataset, + subTrainset[devId], + CUDA_R_32F, + mesoClusterSize[i], + labelsMP[devId], + desc->metric, + (iter != 0), + predictWorkspace[devId], + clusterCentersMP[devId], + clusterSizeMP[devId], + true); + if ((iter + 1 < numIterations_2) && kmeans::adjust_centers(clusterCentersEach[devId], + numFineClusters[i], + desc->dimDataset, + subTrainset[devId], + mesoClusterSize[i], + labelsMP[devId], + clusterSizeMP[devId], + (float)1.0 / 4, + device_memory, + handle.get_stream())) { + iter -= 1; + if (desc->metric == distance::DistanceType::InnerProduct) { + utils::normalize_rows( + numFineClusters[i], desc->dimDataset, clusterCentersEach[devId], handle.get_stream()); + } + } + } + RAFT_CUDA_TRY(cudaMemcpy(clusterCenters + (desc->dimDataset * csumFineClusters[i]), + clusterCentersEach[devId], + sizeof(float) * numFineClusters[i] * desc->dimDataset, + cudaMemcpyDeviceToDevice)); + } + for (int devId = 0; devId < 1; devId++) { + RAFT_CUDA_TRY(cudaSetDevice(devId)); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + } + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); + + 
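+    // At this point clusterCenters holds the centroids of all numClusters fine
+    // clusters, stored contiguously per meso-cluster; the per-meso-cluster scratch
+    // buffers are released below, before the global fine-tuning pass.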
_cuann_multi_device_free(idsTrainset, 1); + _cuann_multi_device_free(subTrainset, 1); + _cuann_multi_device_free(labelsMP, 1); + _cuann_multi_device_free(clusterCentersEach, 1); + _cuann_multi_device_free(clusterCentersMP, 1); + _cuann_multi_device_free(clusterSizeMP, 1); + _cuann_multi_device_free((uint8_t**)predictWorkspace, 1); + + RAFT_CUDA_TRY(cudaFree(mesoClusterSize)); + RAFT_CUDA_TRY(cudaFree(mesoClusterLabels)); + RAFT_CUDA_TRY(cudaFree(mesoClusterCenters)); + RAFT_CUDA_TRY(cudaFree(mesoClusterCentersTemp)); + + free(numFineClusters); + free(csumFineClusters); + + // + // Fine-tuning kmeans for whole clusters (with multipel GPUs) + // + // (*) Since the likely cluster centroids have been calculated + // hierarchically already, the number of iteration for fine-tuning + // kmeans for whole clusters should be reduced. However, there + // is a possibility that the clusters could be unbalanced here, + // in which case the actual number of iterations would be increased. + // + const int X = 5; + int numIterations_X = max(numIterations / 10, 2) * X; + for (int iter = 0; iter < numIterations_X; iter += X) { + _cuann_kmeans_predict_MP(handle, + clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->metric, + true, + clusterSize, + true /* to update clusterCenters */); + if ((iter + 1 < numIterations_X) && kmeans::adjust_centers(clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + numTrainset, + trainsetLabels, + clusterSize, + (float)1.0 / 5, + device_memory, + handle.get_stream())) { + iter -= (X - 1); + if (desc->metric == distance::DistanceType::InnerProduct) { + utils::normalize_rows( + desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); + } + } + } + } else { + // Flat kmeans + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + _cuann_kmeans_predict(handle, + clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->metric, + (iter != 0), + NULL, + clusterCentersTemp, + clusterSize, + true); + if ((iter + 1 < numIterations_2) && kmeans::adjust_centers(clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + numTrainset, + trainsetLabels, + clusterSize, + (float)1.0 / 4, + device_memory, + handle.get_stream())) { + iter -= 1; + if (desc->metric == distance::DistanceType::InnerProduct) { + utils::normalize_rows( + desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); + } + } + } + } + + uint32_t* datasetLabels; // [numDataset] + RAFT_CUDA_TRY(cudaMallocManaged(&datasetLabels, sizeof(uint32_t) * desc->numDataset)); + + // + // Predict labels of whole dataset (with multiple GPUs) + // + _cuann_kmeans_predict_MP(handle, + clusterCenters, + desc->numClusters, + desc->dimDataset, + dataset, + dtype, + desc->numDataset, + datasetLabels, + desc->metric, + true, + clusterSize, + true /* to update clusterCenters */); + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + _cuann_kmeans_show_centers(clusterCenters, desc->numClusters, desc->dimDataset, clusterSize); +#endif + + // Make rotation matrix + RAFT_LOG_DEBUG("# dimDataset: %u\n", desc->dimDataset); + RAFT_LOG_DEBUG("# dimRotDataset: %u\n", desc->dimRotDataset); + RAFT_LOG_DEBUG("# randomRotation: %s\n", randomRotation ? 
"enabled" : "disabled"); + _cuann_make_rotation_matrix( + desc->dimRotDataset, desc->dimDataset, desc->lenPq, randomRotation, rotationMatrix); + + // Rotate clusterCenters + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(handle, + true, + false, + desc->dimRotDataset, + desc->numClusters, + desc->dimDataset, + &alpha, + rotationMatrix, + desc->dimDataset, + clusterCenters, + desc->dimDataset, + &beta, + clusterRotCenters, + desc->dimRotDataset, + handle.get_stream()); + + // + // Make cluster_offsets, originalNumbers and pqDataset + // + uint32_t maxClusterSize = 0; + // cluster_offsets + cluster_offsets[0] = 0; + for (uint32_t l = 0; l < desc->numClusters; l++) { + cluster_offsets[l + 1] = cluster_offsets[l] + clusterSize[l]; + if (maxClusterSize < clusterSize[l]) { maxClusterSize = clusterSize[l]; } + } + RAFT_EXPECTS(cluster_offsets[desc->numClusters] == desc->numDataset, + "Cluster sizes do not add up"); + desc->maxClusterSize = maxClusterSize; + + // originalNumbers + for (uint32_t i = 0; i < desc->numDataset; i++) { + uint32_t l = datasetLabels[i]; + originalNumbers[cluster_offsets[l]] = i; + cluster_offsets[l] += 1; + } + + // Recover cluster_offsets + for (uint32_t l = 0; l < desc->numClusters; l++) { + cluster_offsets[l] -= clusterSize[l]; + } + + // [numDevices][1 << bitPq, lenPq] + float** pqCentersTemp = + _cuann_multi_device_malloc(1, (1 << desc->bitPq) * desc->lenPq, "pqCentersTemp"); + + // [numDevices][1 << bitPq,] + uint32_t** pqClusterSize = + _cuann_multi_device_malloc(1, (1 << desc->bitPq), "pqClusterSize"); + + // Allocate workspace for PQ codebook training + size_t sizePqPredictWorkspace = + _cuann_kmeans_predict_bufferSize((1 << desc->bitPq), desc->lenPq, numTrainset); + sizePqPredictWorkspace = max(sizePqPredictWorkspace, + _cuann_kmeans_predict_bufferSize( + (1 << desc->bitPq), desc->lenPq, maxClusterSize * desc->dimPq)); + void** pqPredictWorkspace = + (void**)_cuann_multi_device_malloc(1, sizePqPredictWorkspace, "pqPredictWorkspace"); + + if (desc->typePqCenter == codebook_gen::PER_SUBSPACE) { + // + // Training PQ codebook (codebook_gen::PER_SUBSPACE) + // (*) PQ codebooks are trained for each subspace. 
+ // + + // Predict label of trainset again (with multiple GPUs) + _cuann_kmeans_predict_MP(handle, + clusterCenters, + desc->numClusters, + desc->dimDataset, + trainset, + dtype, + numTrainset, + trainsetLabels, + desc->metric, + true, + NULL, + false /* do not update clusterCenters */); + + // [dimPq, numTrainset, lenPq] + size_t sizeModTrainset = sizeof(float) * desc->dimPq * numTrainset * desc->lenPq; + float* modTrainset = (float*)malloc(sizeModTrainset); + memset(modTrainset, 0, sizeModTrainset); + + // modTrainset[] = transpose( rotate(trainset[]) - clusterRotCenters[] ) +#pragma omp parallel for + for (uint32_t i = 0; i < numTrainset; i++) { + uint32_t l = trainsetLabels[i]; + for (uint32_t j = 0; j < desc->dimRotDataset; j++) { + float val; + if (dtype == CUDA_R_32F) { + val = + _cuann_dot(desc->dimDataset, + (float*)trainset + ((uint64_t)(desc->dimDataset) * i), + 1, + rotationMatrix + ((uint64_t)(desc->dimDataset) * j), + 1); + } else if (dtype == CUDA_R_8U) { + float divisor = 256.0; + val = _cuann_dot( + desc->dimDataset, + (uint8_t*)trainset + ((uint64_t)(desc->dimDataset) * i), + 1, + rotationMatrix + ((uint64_t)(desc->dimDataset) * j), + 1, + divisor); + } else if (dtype == CUDA_R_8I) { + float divisor = 128.0; + val = + _cuann_dot(desc->dimDataset, + (int8_t*)trainset + ((uint64_t)(desc->dimDataset) * i), + 1, + rotationMatrix + ((uint64_t)(desc->dimDataset) * j), + 1, + divisor); + } + uint32_t j0 = j / (desc->lenPq); // 0 <= j0 < dimPq + uint32_t j1 = j % (desc->lenPq); // 0 <= j1 < lenPq + uint64_t idx = + j1 + ((uint64_t)(desc->lenPq) * i) + ((uint64_t)(desc->lenPq) * numTrainset * j0); + modTrainset[idx] = val - clusterRotCenters[j + (desc->dimRotDataset * l)]; + } + } + + // [numDevices][numTrainset, lenPq] + float** subTrainset = + _cuann_multi_device_malloc(1, numTrainset * desc->lenPq, "subTrainset"); + + // [numDevices][numTrainset] + uint32_t** subTrainsetLabels = + _cuann_multi_device_malloc(1, numTrainset, "subTrainsetLabels"); + + float** pqCentersEach = + _cuann_multi_device_malloc(1, ((1 << desc->bitPq) * desc->lenPq), "pqCentersEach"); + +#pragma omp parallel for schedule(dynamic) num_threads(1) + for (uint32_t j = 0; j < desc->dimPq; j++) { + int devId = omp_get_thread_num(); + RAFT_CUDA_TRY(cudaSetDevice(devId)); + + float* curPqCenters = pqCenters + ((1 << desc->bitPq) * desc->lenPq) * j; + RAFT_CUDA_TRY(cudaMemcpy(subTrainset[devId], + modTrainset + ((uint64_t)numTrainset * desc->lenPq * j), + sizeof(float) * numTrainset * desc->lenPq, + cudaMemcpyHostToDevice)); + // Train kmeans for each PQ + int numIterations_2 = numIterations * 2; + for (int iter = 0; iter < numIterations_2; iter += 2) { + if (devId == 0) { + fprintf(stderr, + "(%s) Training PQ codebook %u (out of %u): " + "%.1f / %u \r", + __func__, + j, + desc->dimPq, + (float)iter / 2, + numIterations); + } + _cuann_kmeans_predict(handle, + pqCentersEach[devId], + (1 << desc->bitPq), + desc->lenPq, + subTrainset[devId], + CUDA_R_32F, + numTrainset, + subTrainsetLabels[devId], + raft::distance::DistanceType::L2Expanded, + (iter != 0), + pqPredictWorkspace[devId], + pqCentersTemp[devId], + pqClusterSize[devId], + true); + if ((iter + 1 < numIterations_2) && kmeans::adjust_centers(pqCentersEach[devId], + (1 << desc->bitPq), + desc->lenPq, + subTrainset[devId], + numTrainset, + subTrainsetLabels[devId], + pqClusterSize[devId], + (float)1.0 / 4, + device_memory, + handle.get_stream())) { + iter -= 1; + } + } + RAFT_CUDA_TRY(cudaMemcpy(curPqCenters, + pqCentersEach[devId], + sizeof(float) * ((1 << 
desc->bitPq) * desc->lenPq), + cudaMemcpyDeviceToDevice)); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + if (j == 0) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + _cuann_kmeans_show_centers( + curPqCenters, (1 << desc->bitPq), desc->lenPq, pqClusterSize[devId]); + } +#endif + } + fprintf(stderr, "\n"); + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); + + _cuann_multi_device_free(subTrainset, 1); + _cuann_multi_device_free(subTrainsetLabels, 1); + _cuann_multi_device_free(pqCentersEach, 1); + free(modTrainset); + } + + // + // Compute PQ code for whole dataset + // + _cuann_compute_PQ_code(handle, + desc->numDataset, + desc->dimDataset, + desc->dimRotDataset, + desc->dimPq, + desc->lenPq, + desc->bitPq, + desc->numClusters, + desc->typePqCenter, + maxClusterSize, + clusterCenters, + rotationMatrix, + dataset, + originalNumbers, + clusterSize, + cluster_offsets, + pqCenters, + numIterations, + pqDataset); + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); + + // + _cuann_get_inclusiveSumSortedClusterSize( + desc, cluster_offsets, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); + _cuann_get_sqsumClusters(desc, clusterCenters, &(desc->sqsumClusters)); + + { + // combine clusterCenters and sqsumClusters + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + float* tmpClusterCenters; // [numClusters, dimDataset] + RAFT_CUDA_TRY( + cudaMallocManaged(&tmpClusterCenters, sizeof(float) * desc->numClusters * desc->dimDataset)); + for (uint32_t i = 0; i < desc->numClusters * desc->dimDataset; i++) { + tmpClusterCenters[i] = clusterCenters[i]; + } + for (uint32_t i = 0; i < desc->numClusters; i++) { + for (uint32_t j = 0; j < desc->dimDataset; j++) { + clusterCenters[j + (desc->dimDatasetExt * i)] = + tmpClusterCenters[j + (desc->dimDataset * i)]; + } + clusterCenters[desc->dimDataset + (desc->dimDatasetExt * i)] = desc->sqsumClusters[i]; + } + RAFT_CUDA_TRY(cudaFree(tmpClusterCenters)); + } + + // + cuannIvfPqGetIndexSize(desc, &(header->indexSize)); + header->metric = desc->metric; + header->numClusters = desc->numClusters; + header->numDataset = desc->numDataset; + header->dimDataset = desc->dimDataset; + header->dimPq = desc->dimPq; + header->maxClusterSize = maxClusterSize; + header->dimRotDataset = desc->dimRotDataset; + header->bitPq = desc->bitPq; + header->typePqCenter = (uint32_t)(desc->typePqCenter); + header->dtypeDataset = desc->dtypeDataset; + header->dimDatasetExt = desc->dimDatasetExt; + header->numDatasetAdded = 0; + + // + RAFT_CUDA_TRY(cudaFree(clusterSize)); + RAFT_CUDA_TRY(cudaFree(trainsetLabels)); + RAFT_CUDA_TRY(cudaFree(datasetLabels)); + RAFT_CUDA_TRY(cudaFree(clusterCentersTemp)); + + _cuann_multi_device_free(wsKAC, 1); + _cuann_multi_device_free(pqCentersTemp, 1); + _cuann_multi_device_free(pqClusterSize, 1); + _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, 1); + + _cuann_set_device(callerDevId); +} + +// cuannIvfPqSaveIndex +inline void cuannIvfPqSaveIndex(const handle_t& handle, + cuannIvfPqDescriptor_t& desc, + const char* fileName) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + int orgDevId = _cuann_set_device(handle.get_device()); + + FILE* fp = fopen(fileName, "w"); + RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); + + struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); + RAFT_LOG_DEBUG("indexSize: %lu\n", header->indexSize); + if (fwrite(desc->index_ptr, 1, header->indexSize, fp) != header->indexSize) { + RAFT_FAIL("(%s) failed to save index to file (%s)\n", __func__, 
fileName); + } + fclose(fp); + + _cuann_set_device(orgDevId); +} + +// cuannIvfPqLoadIndex +inline void cuannIvfPqLoadIndex(const handle_t& handle, + cuannIvfPqDescriptor_t& desc, + const char* fileName) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + int orgDevId = _cuann_set_device(handle.get_device()); + + if (1 /* *index == NULL */) { + FILE* fp = fopen(fileName, "r"); + RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); + + if (desc->index_ptr != NULL) { RAFT_CUDA_TRY(cudaFree(desc->index_ptr)); } + size_t indexSize; + fread(&indexSize, sizeof(size_t), 1, fp); + RAFT_LOG_DEBUG("indexSize: %lu\n", indexSize); + RAFT_CUDA_TRY(cudaMallocManaged(&(desc->index_ptr), indexSize)); + fseek(fp, 0, SEEK_SET); + if (fread(desc->index_ptr, 1, indexSize, fp) != indexSize) { + RAFT_FAIL("(%s) failed to load index to from file (%s)\n", __func__, fileName); + } + fclose(fp); + + RAFT_CUDA_TRY( + cudaMemAdvise(desc->index_ptr, indexSize, cudaMemAdviseSetReadMostly, handle.get_device())); + } + + struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); + desc->numClusters = header->numClusters; + desc->numDataset = header->numDataset; + desc->dimDataset = header->dimDataset; + desc->dimPq = header->dimPq; + desc->metric = (distance::DistanceType)header->metric; + desc->maxClusterSize = header->maxClusterSize; + desc->dimRotDataset = header->dimRotDataset; + desc->lenPq = desc->dimRotDataset / desc->dimPq; + desc->bitPq = header->bitPq; + desc->typePqCenter = (codebook_gen)header->typePqCenter; + desc->dtypeDataset = (cudaDataType_t)header->dtypeDataset; + desc->dimDatasetExt = header->dimDatasetExt; + desc->indexVersion = header->version; + + float* clusterCenters; // [numClusters, dimDatasetExt] + float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* originalNumbers; // [numDataset] + uint32_t* cluster_offsets; // [numClusters + 1] + float* rotationMatrix; // [dimDataset, dimRotDataset] + float* clusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(desc, + &header, + &clusterCenters, + &pqCenters, + &pqDataset, + &originalNumbers, + &cluster_offsets, + &rotationMatrix, + &clusterRotCenters); + + // + _cuann_get_inclusiveSumSortedClusterSize( + desc, cluster_offsets, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); + + size_t size; + // pqDataset + size = sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8; + if (size < handle.get_device_properties().totalGlobalMem) { + RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqDataset, size, handle.get_device())); + } + // clusterCenters + size = sizeof(float) * desc->numClusters * desc->dimDatasetExt; + RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterCenters, size, handle.get_device())); + // pqCenters + if (desc->typePqCenter == codebook_gen::PER_SUBSPACE) { + size = sizeof(float) * desc->dimPq * (1 << desc->bitPq) * desc->lenPq; + } else { + size = sizeof(float) * desc->numClusters * (1 << desc->bitPq) * desc->lenPq; + } + RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqCenters, size, handle.get_device())); + // originalNumbers + size = sizeof(uint32_t) * desc->numDataset; + RAFT_CUDA_TRY(cudaMemPrefetchAsync(originalNumbers, size, handle.get_device())); + // cluster_offsets + size = sizeof(uint32_t) * (desc->numClusters + 1); + RAFT_CUDA_TRY(cudaMemPrefetchAsync(cluster_offsets, size, handle.get_device())); + // rotationMatrix + if 
(rotationMatrix != NULL) {
+    size = sizeof(float) * desc->dimDataset * desc->dimRotDataset;
+    RAFT_CUDA_TRY(cudaMemPrefetchAsync(rotationMatrix, size, handle.get_device()));
+  }
+  // clusterRotCenters
+  if (clusterRotCenters != NULL) {
+    size = sizeof(float) * desc->numClusters * desc->dimRotDataset;
+    RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterRotCenters, size, handle.get_device()));
+  }
+
+  _cuann_set_device(orgDevId);
+}
+
+template <typename T>
+auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex(
+  const handle_t& handle,
+  cuannIvfPqDescriptor_t& oldDesc,
+  const T* newVectors, /* [numNewVectors, dimDataset] */
+  uint32_t numNewVectors) -> cuannIvfPqDescriptor_t
+{
+  switch (utils::check_pointer_residency(newVectors)) {
+    case utils::pointer_residency::host_only:
+    case utils::pointer_residency::host_and_device: break;
+    default: RAFT_FAIL("newVectors must be accessible from the host.");
+  }
+  int cuannDevId  = handle.get_device();
+  int callerDevId = _cuann_set_device(cuannDevId);
+
+  cudaDataType_t dtype = oldDesc->dtypeDataset;
+  if constexpr (std::is_same_v<T, float>) {
+    RAFT_EXPECTS(
+      dtype == CUDA_R_32F,
+      "The old index type (%d) doesn't match CUDA_R_32F required by the template instantiation",
+      dtype);
+  } else if constexpr (std::is_same_v<T, uint8_t>) {
+    RAFT_EXPECTS(
+      dtype == CUDA_R_8U,
+      "The old index type (%d) doesn't match CUDA_R_8U required by the template instantiation",
+      dtype);
+  } else if constexpr (std::is_same_v<T, int8_t>) {
+    RAFT_EXPECTS(
+      dtype == CUDA_R_8I,
+      "The old index type (%d) doesn't match CUDA_R_8I required by the template instantiation",
+      dtype);
+  } else {
+    static_assert(
+      std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+      "unsupported type");
+  }
+
+  char dtypeString[64];
+  _cuann_get_dtype_string(dtype, dtypeString);
+  RAFT_LOG_DEBUG("dtype: %s", dtypeString);
+  RAFT_LOG_DEBUG("dimDataset: %u", oldDesc->dimDataset);
+  struct cuannIvfPqIndexHeader* oldHeader;
+  float* oldClusterCenters;       // [numClusters, dimDatasetExt]
+  float* oldPqCenters;            // [dimPq, 1 << bitPq, lenPq], or
+                                  // [numClusters, 1 << bitPq, lenPq]
+  uint8_t* oldPqDataset;          // [numDataset, dimPq * bitPq / 8]
+  uint32_t* oldOriginalNumbers;   // [numDataset]
+  uint32_t* old_cluster_offsets;  // [numClusters + 1]
+  float* oldRotationMatrix;       // [dimDataset, dimRotDataset]
+  float* oldClusterRotCenters;    // [numClusters, dimRotDataset]
+  _cuann_get_index_pointers(oldDesc,
+                            &oldHeader,
+                            &oldClusterCenters,
+                            &oldPqCenters,
+                            &oldPqDataset,
+                            &oldOriginalNumbers,
+                            &old_cluster_offsets,
+                            &oldRotationMatrix,
+                            &oldClusterRotCenters);
+
+  //
+  // The clusterCenters stored in the index contain data other than the cluster
+  // centroids to speed up the search. Here, only the cluster centroids
+  // are extracted.
+  //
+  float* clusterCenters;  // [numClusters, dimDataset]
+  RAFT_CUDA_TRY(
+    cudaMallocManaged(&clusterCenters, sizeof(float) * oldDesc->numClusters * oldDesc->dimDataset));
+  for (uint32_t i = 0; i < oldDesc->numClusters; i++) {
+    memcpy(clusterCenters + (uint64_t)i * oldDesc->dimDataset,
+           oldClusterCenters + (uint64_t)i * oldDesc->dimDatasetExt,
+           sizeof(float) * oldDesc->dimDataset);
+  }
+
+  //
+  // Use the existing cluster centroids to find the label (cluster ID)
+  // of the vectors to be added.
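+  //
+  // (For exposition only.) The GPU call below, _cuann_kmeans_predict_MP, is an optimized
+  // nearest-centroid assignment; assuming the L2 metric and ignoring the fixed-point scaling
+  // applied to uint8_t/int8_t inputs, it is equivalent to this scalar sketch:
+  //
+  //   for (uint32_t i = 0; i < numNewVectors; i++) {
+  //     uint32_t best  = 0;
+  //     float bestDist = FLT_MAX;
+  //     for (uint32_t l = 0; l < oldDesc->numClusters; l++) {
+  //       float d = 0.f;
+  //       for (uint32_t k = 0; k < oldDesc->dimDataset; k++) {
+  //         float t = (float)newVectors[(uint64_t)i * oldDesc->dimDataset + k] -
+  //                   clusterCenters[(uint64_t)l * oldDesc->dimDataset + k];
+  //         d += t * t;
+  //       }
+  //       if (d < bestDist) { bestDist = d; best = l; }
+  //     }
+  //     newVectorLabels[i] = best;
+  //   }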
+ // + uint32_t* newVectorLabels; // [numNewVectors,] + RAFT_CUDA_TRY(cudaMallocManaged(&newVectorLabels, sizeof(uint32_t) * numNewVectors)); + RAFT_CUDA_TRY(cudaMemset(newVectorLabels, 0, sizeof(uint32_t) * numNewVectors)); + uint32_t* clusterSize; // [numClusters,] + RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * oldDesc->numClusters)); + RAFT_CUDA_TRY(cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters)); + _cuann_kmeans_predict_MP(handle, + clusterCenters, + oldDesc->numClusters, + oldDesc->dimDataset, + newVectors, + dtype, + numNewVectors, + newVectorLabels, + oldDesc->metric, + true, + clusterSize, + false /* do not update clusterCenters */); + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + { + const int _num_show = 10; + fprintf(stderr, "# numNewVectors: %u\n", numNewVectors); + fprintf(stderr, "# newVectorLabels: "); + for (uint32_t i = 0; i < numNewVectors; i++) { + if ((i < _num_show) || (numNewVectors - i <= _num_show)) { + fprintf(stderr, "%u, ", newVectorLabels[i]); + } else if (i == _num_show) { + fprintf(stderr, "..., "); + } + } + fprintf(stderr, "\n"); + } + { + const int _num_show = 10; + fprintf(stderr, "# oldDesc->numClusters: %u\n", oldDesc->numClusters); + fprintf(stderr, "# clusterSize: "); + int _sum = 0; + for (uint32_t i = 0; i < oldDesc->numClusters; i++) { + _sum += clusterSize[i]; + if ((i < _num_show) || (oldDesc->numClusters - i <= _num_show)) { + fprintf(stderr, "%u, ", clusterSize[i]); + } else if (i == _num_show) { + fprintf(stderr, "..., "); + } + } + fprintf(stderr, "\n"); + fprintf(stderr, "# _sum: %d\n", _sum); + } +#endif + + // + // Make cluster_offsets, originalNumbers + // + uint32_t maxClusterSize = 0; + uint32_t* cluster_offsets; // [numClusters + 1] + uint32_t* originalNumbers; // [numNewVectors] + cluster_offsets = (uint32_t*)malloc(sizeof(uint32_t) * (oldDesc->numClusters + 1)); + originalNumbers = (uint32_t*)malloc(sizeof(uint32_t) * numNewVectors); + // cluster_offsets + cluster_offsets[0] = 0; + for (uint32_t l = 0; l < oldDesc->numClusters; l++) { + cluster_offsets[l + 1] = cluster_offsets[l] + clusterSize[l]; + maxClusterSize = max(maxClusterSize, clusterSize[l]); + } + RAFT_EXPECTS(cluster_offsets[oldDesc->numClusters] == numNewVectors, + "cluster sizes do not add up."); + // originalNumbers + for (uint32_t i = 0; i < numNewVectors; i++) { + uint32_t l = newVectorLabels[i]; + originalNumbers[cluster_offsets[l]] = i; + cluster_offsets[l] += 1; + } + // Recover cluster_offsets + for (uint32_t l = 0; l < oldDesc->numClusters; l++) { + cluster_offsets[l] -= clusterSize[l]; + } + + // + // Compute PQ code for new vectors + // + uint8_t* pqDataset; // [numNewVectors, dimPq * bitPq / 8] + RAFT_CUDA_TRY(cudaMallocManaged( + &pqDataset, sizeof(uint8_t) * numNewVectors * oldDesc->dimPq * oldDesc->bitPq / 8)); + _cuann_compute_PQ_code(handle, + numNewVectors, + oldDesc->dimDataset, + oldDesc->dimRotDataset, + oldDesc->dimPq, + oldDesc->lenPq, + oldDesc->bitPq, + oldDesc->numClusters, + oldDesc->typePqCenter, + maxClusterSize, + clusterCenters, + oldRotationMatrix, + newVectors, + originalNumbers, + clusterSize, + cluster_offsets, + oldPqCenters, + 0, + pqDataset); + RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); + + // + // Create descriptor for new index + // + auto newDesc = cuannIvfPqCreateDescriptor(); + memcpy(newDesc.get(), oldDesc.get(), sizeof(struct cuannIvfPqDescriptor)); + newDesc->numDataset += numNewVectors; + newDesc->inclusiveSumSortedClusterSize = nullptr; + newDesc->sqsumClusters = nullptr; + 
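+  // The remaining steps below allocate the new index and merge it cluster by cluster: for each
+  // cluster l, the old records [old_cluster_offsets[l], old_cluster_offsets[l + 1]) are copied
+  // first and the newly added records follow. For example (assumed sizes, for exposition only),
+  // an old cluster of 3 records that receives 2 new records occupies 5 consecutive slots starting
+  // at new_cluster_offsets[l] in both newOriginalNumbers and newPqDataset.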
newDesc->index_ptr = nullptr; + RAFT_LOG_DEBUG("numDataset: %u -> %u", oldDesc->numDataset, newDesc->numDataset); + + // + // Allocate memory for new index + // + size_t newIndexSize; + cuannIvfPqGetIndexSize(newDesc, &newIndexSize); + RAFT_LOG_DEBUG("indexSize: %lu -> %lu", oldHeader->indexSize, newIndexSize); + RAFT_CUDA_TRY(cudaMallocManaged(&(newDesc->index_ptr), newIndexSize)); + memset(newDesc->index_ptr, 0, newIndexSize); + struct cuannIvfPqIndexHeader* newHeader; + float* newClusterCenters; // [numClusters, dimDatasetExt] + float* newPqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, lenPq] + uint8_t* newPqDataset; // [numDataset, dimPq * bitPq / 8] *** + uint32_t* newOriginalNumbers; // [numDataset] *** + uint32_t* new_cluster_offsets; // [numClusters + 1] *** + float* newRotationMatrix; // [dimDataset, dimRotDataset] + float* newClusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(newDesc, + &newHeader, + &newClusterCenters, + &newPqCenters, + &newPqDataset, + &newOriginalNumbers, + &new_cluster_offsets, + &newRotationMatrix, + &newClusterRotCenters); + + // + // Copy the unchanged parts + // header, clusterCenters, pqCenters, rotationMatrix, clusterRotCenters + // + memcpy(newHeader, oldHeader, sizeof(struct cuannIvfPqIndexHeader)); + { + cuannIvfPqGetIndexSize(newDesc, &(newHeader->indexSize)); + newHeader->numDataset = newDesc->numDataset; + newHeader->numDatasetAdded += numNewVectors; + } + memcpy(newClusterCenters, oldClusterCenters, _cuann_getIndexSize_clusterCenters(oldDesc)); + memcpy(newPqCenters, oldPqCenters, _cuann_getIndexSize_pqCenters(oldDesc)); + memcpy(newRotationMatrix, oldRotationMatrix, _cuann_getIndexSize_rotationMatrix(oldDesc)); + memcpy( + newClusterRotCenters, oldClusterRotCenters, _cuann_getIndexSize_clusterRotCenters(oldDesc)); + + // + // Make new_cluster_offsets + // + maxClusterSize = 0; + new_cluster_offsets[0] = 0; + for (uint32_t l = 0; l < newDesc->numClusters; l++) { + uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; + new_cluster_offsets[l + 1] = new_cluster_offsets[l]; + new_cluster_offsets[l + 1] += oldClusterSize + clusterSize[l]; + maxClusterSize = max(maxClusterSize, oldClusterSize + clusterSize[l]); + } + { + newDesc->maxClusterSize = maxClusterSize; + newHeader->maxClusterSize = maxClusterSize; + } + RAFT_LOG_DEBUG("maxClusterSize: %u -> %u", oldDesc->maxClusterSize, newDesc->maxClusterSize); + + // + // Make newOriginalNumbers + // + for (uint32_t i = 0; i < numNewVectors; i++) { + originalNumbers[i] += oldDesc->numDataset; + } + for (uint32_t l = 0; l < newDesc->numClusters; l++) { + uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; + memcpy(newOriginalNumbers + new_cluster_offsets[l], + oldOriginalNumbers + old_cluster_offsets[l], + sizeof(uint32_t) * oldClusterSize); + memcpy(newOriginalNumbers + new_cluster_offsets[l] + oldClusterSize, + originalNumbers + cluster_offsets[l], + sizeof(uint32_t) * clusterSize[l]); + } + + // + // Make newPqDataset + // + size_t unitPqDataset = newDesc->dimPq * newDesc->bitPq / 8; + for (uint32_t l = 0; l < newDesc->numClusters; l++) { + uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; + memcpy(newPqDataset + unitPqDataset * new_cluster_offsets[l], + oldPqDataset + unitPqDataset * old_cluster_offsets[l], + sizeof(uint8_t) * unitPqDataset * oldClusterSize); + memcpy(newPqDataset + unitPqDataset * (new_cluster_offsets[l] + oldClusterSize), + pqDataset + unitPqDataset 
* cluster_offsets[l],
+           sizeof(uint8_t) * unitPqDataset * clusterSize[l]);
+  }
+
+  _cuann_get_inclusiveSumSortedClusterSize(
+    newDesc, new_cluster_offsets, newClusterCenters, &(newDesc->inclusiveSumSortedClusterSize));
+
+  //
+  // Done
+  //
+  if (newHeader->numDatasetAdded * 2 > newHeader->numDataset) {
+    RAFT_LOG_INFO(
+      "The total number of vectors in the new index"
+      " is now more than twice the initial number of vectors."
+      " You may want to re-build the index from scratch."
+      " (numVectors: %u, numVectorsAdded: %u)",
+      newHeader->numDataset,
+      newHeader->numDatasetAdded);
+  }
+
+  free(originalNumbers);
+  free(cluster_offsets);
+
+  RAFT_CUDA_TRY(cudaFree(pqDataset));
+  RAFT_CUDA_TRY(cudaFree(clusterSize));
+  RAFT_CUDA_TRY(cudaFree(newVectorLabels));
+  RAFT_CUDA_TRY(cudaFree(clusterCenters));
+
+  _cuann_set_device(callerDevId);
+  return newDesc;
+}
+
+// cuannIvfPqSetSearchParameters
+inline void cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t& desc,
+                                          const uint32_t numProbes,
+                                          const uint32_t topK)
+{
+  RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized.");
+  RAFT_EXPECTS(numProbes > 0, "numProbes must be larger than zero");
+  RAFT_EXPECTS(topK > 0, "topK must be larger than zero");
+  RAFT_EXPECTS(numProbes <= desc->numClusters,
+               "numProbes (%u) must not be larger than numClusters (%u)",
+               numProbes,
+               desc->numClusters);
+  RAFT_EXPECTS(topK <= desc->numDataset,
+               "topK (%u) must not be larger than numDataset (%u)",
+               topK,
+               desc->numDataset);
+
+  uint32_t numSamplesWorstCase = desc->numDataset;
+  if (numProbes < desc->numClusters) {
+    numSamplesWorstCase =
+      desc->numDataset -
+      desc->inclusiveSumSortedClusterSize[desc->numClusters - 1 - numProbes -
+                                          desc->_numClustersSize0];  // (*) urgent WA, need to be
+                                                                     // fixed.
+  }
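+  // Rough illustration of the check below (assuming inclusiveSumSortedClusterSize[k] holds the
+  // inclusive prefix sum of the cluster sizes sorted in descending order, and ignoring the
+  // _numClustersSize0 workaround): with cluster sizes {10, 5, 3, 2} and numProbes = 2, the worst
+  // case is a query whose probes hit only the two smallest clusters, so
+  // numSamplesWorstCase = 20 - (10 + 5) = 5, and any topK above 5 could not be filled reliably.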
+ } + RAFT_EXPECTS(topK <= numSamplesWorstCase, + "numProbes is too small to get topK results reliably (numProbes: %u, topK: %u, " + "numSamplesWorstCase: %u).", + numProbes, + topK, + numSamplesWorstCase); + desc->numProbes = numProbes; + desc->topK = topK; + desc->maxSamples = desc->inclusiveSumSortedClusterSize[numProbes - 1]; + if (desc->maxSamples % 128) { desc->maxSamples += 128 - (desc->maxSamples % 128); } + desc->internalDistanceDtype = CUDA_R_32F; + desc->smemLutDtype = CUDA_R_32F; + desc->preferredThreadBlockSize = 0; +} + +// cuannIvfPqSetSearchParameters +inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t& desc, + cudaDataType_t internalDistanceDtype, + cudaDataType_t smemLutDtype, + const uint32_t preferredThreadBlockSize) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + RAFT_EXPECTS(internalDistanceDtype == CUDA_R_16F || internalDistanceDtype == CUDA_R_32F, + "internalDistanceDtype must be either CUDA_R_16F or CUDA_R_32F"); + RAFT_EXPECTS( + smemLutDtype == CUDA_R_16F || smemLutDtype == CUDA_R_32F || smemLutDtype == CUDA_R_8U, + "smemLutDtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U"); + RAFT_EXPECTS(preferredThreadBlockSize == 256 || preferredThreadBlockSize == 512 || + preferredThreadBlockSize == 1024 || preferredThreadBlockSize == 0, + "preferredThreadBlockSize must be 0, 256, 512 or 1024, but %u is given.", + preferredThreadBlockSize); + desc->internalDistanceDtype = internalDistanceDtype; + desc->smemLutDtype = smemLutDtype; + desc->preferredThreadBlockSize = preferredThreadBlockSize; +} + +// cuannIvfPqGetSearchParameters +inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t& desc, + uint32_t* numProbes, + uint32_t* topK) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + *numProbes = desc->numProbes; + *topK = desc->topK; +} + +// cuannIvfPqGetSearchTuningParameters +inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t& desc, + cudaDataType_t* internalDistanceDtype, + cudaDataType_t* smemLutDtype, + uint32_t* preferredThreadBlockSize) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + *internalDistanceDtype = desc->internalDistanceDtype; + *smemLutDtype = desc->smemLutDtype; + *preferredThreadBlockSize = desc->preferredThreadBlockSize; +} + +// cuannIvfPqSearch +inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, + cuannIvfPqDescriptor_t& desc, + uint32_t maxQueries, + size_t maxWorkspaceSize, + size_t* workspaceSize) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + + size_t max_ws = maxWorkspaceSize; + if (max_ws == 0) { + max_ws = (size_t)1 * 1024 * 1024 * 1024; // default, 1GB + } else { + max_ws = max(max_ws, (size_t)512 * 1024 * 1024); + } + + size_t size_0 = + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries + Pow2<128>::roundUp(sizeof(uint32_t) * maxQueries * desc->numProbes) + // clusterLabels.. 
+ Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->numClusters) + // QCDistances + _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); + if (size_0 > max_ws) { + maxQueries = maxQueries * max_ws / size_0; + if (maxQueries > 32) { maxQueries -= (maxQueries % 32); } + } + // maxQueries = min(max(maxQueries, 1), 1024); + // maxQueries = min(max(maxQueries, 1), 2048); + maxQueries = min(max(maxQueries, 1), 4096); + desc->maxQueries = maxQueries; + + *workspaceSize = + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries + Pow2<128>::roundUp(sizeof(uint32_t) * maxQueries * desc->numProbes); // clusterLabels.. + + max_ws -= *workspaceSize; + desc->maxBatchSize = 1; + while (1) { + uint32_t nextBatchSize = desc->maxBatchSize * max_ws / ivfpq_search_bufferSize(handle, desc); + if (desc->maxBatchSize >= nextBatchSize) break; + desc->maxBatchSize = nextBatchSize; + } + desc->maxBatchSize = min(max(desc->maxBatchSize, 1), maxQueries); + + if (maxQueries > desc->maxBatchSize) { + // Adjust maxBatchSize to reduce workspace size. + uint32_t num = (maxQueries + desc->maxBatchSize - 1) / desc->maxBatchSize; + if (1 < num && num < 5) { desc->maxBatchSize = (maxQueries + num - 1) / num; } + } + + if (1) { + // Adjust maxBatchSize to improve GPU occupancy of topk kernel. + uint32_t numCta_total = getMultiProcessorCount() * 2; + uint32_t numCta_perBatch = numCta_total / desc->maxBatchSize; + float utilization = (float)numCta_perBatch * desc->maxBatchSize / numCta_total; + if (numCta_perBatch > 1 || (numCta_perBatch == 1 && utilization < 0.6)) { + uint32_t numCta_perBatch_1 = numCta_perBatch + 1; + uint32_t maxBatchSize_1 = numCta_total / numCta_perBatch_1; + float utilization_1 = (float)numCta_perBatch_1 * maxBatchSize_1 / numCta_total; + if (utilization < utilization_1) { desc->maxBatchSize = maxBatchSize_1; } + } + } + + size_t size_1 = + Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->numClusters) + // QCDistance + _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); + size_t size_2 = ivfpq_search_bufferSize(handle, desc); + *workspaceSize += max(size_1, size_2); + + RAFT_LOG_TRACE("maxQueries: %u", maxQueries); + RAFT_LOG_TRACE("maxBatchSize: %u", desc->maxBatchSize); + RAFT_LOG_DEBUG( + "workspaceSize: %lu (%.3f GiB)", *workspaceSize, (float)*workspaceSize / 1024 / 1024 / 1024); +} + +template +void cuannIvfPqSearch(const handle_t& handle, + cuannIvfPqDescriptor_t& desc, + const T* queries, /* [numQueries, dimDataset], host or device pointer */ + uint32_t numQueries, + uint64_t* neighbors, /* [numQueries, topK], device pointer */ + float* distances, /* [numQueries, topK], device pointer */ + void* workspace) +{ + RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); + int orgDevId = _cuann_set_device(handle.get_device()); + + cudaDataType_t dtype; + if constexpr (std::is_same_v) { + dtype = CUDA_R_32F; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8U; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_8I; + } else { + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v, + "unsupported type"); + } + + struct cuannIvfPqIndexHeader* header; + float* clusterCenters; // [numClusters, dimDatasetExt] + float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or + // [numClusters, 1 << bitPq, 
lenPq] + uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] + uint32_t* originalNumbers; // [numDataset] + uint32_t* cluster_offsets; // [numClusters + 1] + float* rotationMatrix; // [dimDataset, dimRotDataset] + float* clusterRotCenters; // [numClusters, dimRotDataset] + _cuann_get_index_pointers(desc, + &header, + &clusterCenters, + &pqCenters, + &pqDataset, + &originalNumbers, + &cluster_offsets, + &rotationMatrix, + &clusterRotCenters); + // + void* devQueries; // [maxQueries, dimDatasetExt] + float* curQueries; // [maxQueries, dimDatasetExt] + float* rotQueries; // [maxQueries, dimRotDataset] + uint32_t* clusterLabelsToProbe; // [maxQueries, numProbes] + float* QCDistances; // [maxQueries, numClusters] + void* topkWorkspace; + void* searchWorkspace; + devQueries = (void*)workspace; + curQueries = (float*)((uint8_t*)devQueries + + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); + rotQueries = (float*)((uint8_t*)curQueries + + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); + clusterLabelsToProbe = + (uint32_t*)((uint8_t*)rotQueries + + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimRotDataset)); + // + QCDistances = (float*)((uint8_t*)clusterLabelsToProbe + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); + topkWorkspace = (void*)((uint8_t*)QCDistances + + Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->numClusters)); + // + searchWorkspace = + (void*)((uint8_t*)clusterLabelsToProbe + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); + + void (*_ivfpq_search)(const handle_t&, + cuannIvfPqDescriptor_t&, + uint32_t, + const float*, + const float*, + const uint8_t*, + const uint32_t*, + const uint32_t*, + const uint32_t*, + const float*, + uint64_t*, + float*, + void*); + if (desc->internalDistanceDtype == CUDA_R_16F) { + if (desc->smemLutDtype == CUDA_R_16F) { + _ivfpq_search = ivfpq_search; + } else if (desc->smemLutDtype == CUDA_R_8U) { + _ivfpq_search = ivfpq_search>; + } else { + _ivfpq_search = ivfpq_search; + } + } else { + if (desc->smemLutDtype == CUDA_R_16F) { + _ivfpq_search = ivfpq_search; + } else if (desc->smemLutDtype == CUDA_R_8U) { + _ivfpq_search = ivfpq_search>; + } else { + _ivfpq_search = ivfpq_search; + } + } + + switch (utils::check_pointer_residency(neighbors, distances)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("output pointers must be accessible from the device."); + } + + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, queries)); + + for (uint32_t i = 0; i < numQueries; i += desc->maxQueries) { + uint32_t nQueries = min(desc->maxQueries, numQueries - i); + + float fillValue = 0.0; + if (desc->metric != raft::distance::DistanceType::InnerProduct) { fillValue = 1.0 / -2.0; } + float divisor = 1.0; + if (desc->dtypeDataset == CUDA_R_8U) { + divisor = 256.0; + } else if (desc->dtypeDataset == CUDA_R_8I) { + divisor = 128.0; + } + if (dtype == CUDA_R_32F) { + float* ptrQueries = (float*)queries + ((uint64_t)(desc->dimDataset) * i); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(float) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle.get_stream())); + ptrQueries = (float*)devQueries; + } + _cuann_copy_fill(nQueries, + desc->dimDataset, + ptrQueries, + desc->dimDataset, + curQueries, + 
desc->dimDatasetExt, + fillValue, + divisor, + handle.get_stream()); + } else if (dtype == CUDA_R_8U) { + uint8_t* ptrQueries = (uint8_t*)queries + ((uint64_t)(desc->dimDataset) * i); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(uint8_t) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle.get_stream())); + ptrQueries = (uint8_t*)devQueries; + } + _cuann_copy_fill(nQueries, + desc->dimDataset, + ptrQueries, + desc->dimDataset, + curQueries, + desc->dimDatasetExt, + fillValue, + divisor, + handle.get_stream()); + } else if (dtype == CUDA_R_8I) { + int8_t* ptrQueries = (int8_t*)queries + ((uint64_t)(desc->dimDataset) * i); + if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { + RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, + ptrQueries, + sizeof(int8_t) * nQueries * desc->dimDataset, + cudaMemcpyHostToDevice, + handle.get_stream())); + ptrQueries = (int8_t*)devQueries; + } + _cuann_copy_fill(nQueries, + desc->dimDataset, + ptrQueries, + desc->dimDataset, + curQueries, + desc->dimDatasetExt, + fillValue, + divisor, + handle.get_stream()); + } + + float alpha; + float beta; + uint32_t gemmK = desc->dimDataset; + if (desc->metric == distance::DistanceType::InnerProduct) { + alpha = -1.0; + beta = 0.0; + } else { + alpha = -2.0; + beta = 0.0; + gemmK = desc->dimDataset + 1; + RAFT_EXPECTS(gemmK <= desc->dimDatasetExt, "unexpected gemmK or dimDatasetExt"); + } + linalg::gemm(handle, + true, + false, + desc->numClusters, + nQueries, + gemmK, + &alpha, + clusterCenters, + desc->dimDatasetExt, + curQueries, + desc->dimDatasetExt, + &beta, + QCDistances, + desc->numClusters, + handle.get_stream()); + + // Rotate queries + alpha = 1.0; + beta = 0.0; + linalg::gemm(handle, + true, + false, + desc->dimRotDataset, + nQueries, + desc->dimDataset, + &alpha, + rotationMatrix, + desc->dimDataset, + curQueries, + desc->dimDatasetExt, + &beta, + rotQueries, + desc->dimRotDataset, + handle.get_stream()); + + // Select neighbor clusters for each query. 
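+    //
+    // (Sketch of the algebra behind QCDistances; constants as set above.) For L2, each query is
+    // padded to dimDatasetExt with the constant -1/2 and each stored center row carries its
+    // squared norm in the extra column, so the GEMM with alpha = -2 yields
+    //
+    //   score(q, c) = -2 * dot(q_ext, c_ext) = -2 * dot(q, c) + |c|^2 = |q - c|^2 - |q|^2,
+    //
+    // which ranks clusters exactly like the true L2 distance because |q|^2 is constant for a
+    // given query. _cuann_find_topk below then keeps the numProbes smallest scores per query as
+    // the clusters to probe.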
+ _cuann_find_topk(handle, + desc->numProbes, + nQueries, + desc->numClusters, + NULL, + QCDistances, + clusterLabelsToProbe, + topkWorkspace, + false); + + for (uint32_t j = 0; j < nQueries; j += desc->maxBatchSize) { + uint32_t batchSize = min(desc->maxBatchSize, nQueries - j); + _ivfpq_search(handle, + desc, + batchSize, + clusterRotCenters, + pqCenters, + pqDataset, + originalNumbers, + cluster_offsets, + clusterLabelsToProbe + ((uint64_t)(desc->numProbes) * j), + rotQueries + ((uint64_t)(desc->dimRotDataset) * j), + neighbors + ((uint64_t)(desc->topK) * (i + j)), + distances + ((uint64_t)(desc->topK) * (i + j)), + searchWorkspace); + } + } + + _cuann_set_device(orgDevId); +} + +// +template +__device__ inline float ivfpq_compute_score( + uint32_t dimPq, + uint32_t iDataset, + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const smemLutDtype* preCompScores, // [dimPq, 1 << bitPq] + bool earlyStop, + float kth_score = FLT_MAX) +{ + float score = 0.0; + constexpr uint32_t bitT = sizeof(T) * 8; + const T* headPqDataset = (T*)(pqDataset + (uint64_t)iDataset * (dimPq * bitPq / 8)); + for (int j = 0; j < dimPq / vecLen; j += 1) { + T pqCode = headPqDataset[0]; + headPqDataset += 1; + uint32_t bitLeft = bitT; +#pragma unroll vecLen + for (int k = 0; k < vecLen; k += 1) { + uint8_t code = pqCode; + if (bitLeft > bitPq) { + // This condition is always true here (to make the compiler happy) + if constexpr (bitT > bitPq) { pqCode >>= bitPq; } + bitLeft -= bitPq; + } else { + if (k < vecLen - 1) { + pqCode = headPqDataset[0]; + headPqDataset += 1; + } + code |= (pqCode << bitLeft); + pqCode >>= (bitPq - bitLeft); + bitLeft += (bitT - bitPq); + } + code &= (1 << bitPq) - 1; + score += (float)preCompScores[code]; + preCompScores += (1 << bitPq); + + if (earlyStop && (vecLen > 8) && ((k % 8) == 0)) { + if (score > kth_score) { return FLT_MAX; } + } + } + if (earlyStop && (vecLen <= 8)) { + if (score > kth_score) { return FLT_MAX; } + } + } + return score; +} + +// +template +__device__ inline void warp_merge(K& key, bool acending = true, int group_size = 32) +{ + int lane_id = threadIdx.x % 32; + for (int mask = (group_size >> 1); mask > 0; mask >>= 1) { + bool direction = ((lane_id & mask) == 0); + K opp_key = __shfl_xor_sync(0xffffffff, key, mask); + if ((acending == direction) == (key > opp_key)) { key = opp_key; } + } +} + +// +template +__device__ inline void warp_merge(K& key, V& val, bool acending = true, int group_size = 32) +{ + int lane_id = threadIdx.x % 32; + for (int mask = (group_size >> 1); mask > 0; mask >>= 1) { + bool direction = ((lane_id & mask) == 0); + K opp_key = __shfl_xor_sync(0xffffffff, key, mask); + V opp_val = __shfl_xor_sync(0xffffffff, val, mask); + if ((acending == direction) == ((key > opp_key) || ((key == opp_key) && (val > opp_val)))) { + key = opp_key; + val = opp_val; + } + } +} + +// +template +__device__ inline void warp_sort(K& key, bool acending = true) +{ + int lane_id = threadIdx.x % 32; + for (int group_size = 2; group_size <= 32; group_size <<= 1) { + bool direction = ((lane_id & group_size) == 0); + if ((group_size == 32) && (!acending)) { direction = !direction; } + warp_merge(key, direction, group_size); + } +} + +// +template +__device__ inline void warp_sort(K& key, V& val, bool acending = true) +{ + int lane_id = threadIdx.x % 32; + for (int group_size = 2; group_size <= 32; group_size <<= 1) { + bool direction = ((lane_id & group_size) == 0); + if ((group_size == 32) && (!acending)) { direction = !direction; } + warp_merge(key, val, 
direction, group_size); + } +} + +// +template +__device__ inline void swap_vals(T& val1, T& val2) +{ + T val0 = val1; + val1 = val2; + val2 = val0; +} + +// +template +__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) +{ + if ((key1 > key2) || ((key1 == key2) && (val1 > val2))) { + swap_vals(key1, key2); + swap_vals(val1, val2); + return true; + } + return false; +} + +// +template +__device__ inline bool swap_if_needed(K& key1, K& key2) +{ + if (key1 > key2) { + swap_vals(key1, key2); + return true; + } + return false; +} + +// +template +__device__ inline T max_value_of(); +template <> +__device__ inline float max_value_of() +{ + return FLT_MAX; +} +template <> +__device__ inline uint32_t max_value_of() +{ + return ~0u; +} + +// +template +class BlockTopk { + public: + __device__ BlockTopk(uint32_t topk, K* ptr_kth_key) : _topk(topk), _lane_id(threadIdx.x % 32) + { +#pragma unroll + for (int i = 0; i < depth; i++) { + _key[i] = max_value_of(); + _val[i] = max_value_of(); + } + _nfill = 0; + _init_buf(); + _ptr_kth_key = ptr_kth_key; + if (_ptr_kth_key) { + _kth_key = _ptr_kth_key[0]; + } else { + _kth_key = max_value_of(); + } + // __syncthreads(); + } + + __device__ inline K key(int i) { return _key[i]; } + + __device__ inline V val(int i) { return _val[i]; } + + __device__ inline K kth_key() { return _kth_key; } + + __device__ void add(K key, V val) + { + uint32_t mask = __ballot_sync(0xffffffff, (key < _kth_key)); + if (mask == 0) { return; } + uint32_t nvalid = __popc(mask); + if (_buf_nvalid + nvalid > 32) { + _add(_buf_key, _buf_val); + _init_buf(); + if (_ptr_kth_key) { _kth_key = min(_kth_key, _ptr_kth_key[0]); } + } + _push_buf(key, val, mask, nvalid); + } + + __device__ void finalize() + { + if (_buf_nvalid > 0) { _add(_buf_key, _buf_val); } + _merge(); + } + + protected: + K _key[depth]; + V _val[depth]; + K* _ptr_kth_key; + K _kth_key; + uint32_t _nfill; // 0 <= _nfill <= depth + K _buf_key; + V _buf_val; + uint32_t _buf_nvalid; // 0 <= _buf_nvalid <= 32 + + const uint32_t _topk; + const uint32_t _lane_id; + + __device__ inline void _init_buf() + { + _buf_nvalid = 0; + _buf_key = max_value_of(); + _buf_val = max_value_of(); + } + + __device__ inline void _adjust_nfill() + { +#pragma unroll + for (int j = 1; j < depth; j++) { + if (_nfill == depth - j + 1) { + if (__shfl_sync(0xffffffff, _key[depth - j], 0) <= _kth_key) { return; } + _nfill = depth - j; + } + } + } + + __device__ inline void _push_buf(K key, V val, uint32_t mask, uint32_t nvalid) + { + int i = 0; + if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { + int j = _lane_id - _buf_nvalid; + while (j > 0) { + i = __ffs(mask) - 1; + mask ^= (0x1u << i); + j -= 1; + } + i = __ffs(mask) - 1; + } + K temp_key = __shfl_sync(0xffffffff, key, i); + K temp_val = __shfl_sync(0xffffffff, val, i); + if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { + _buf_key = temp_key; + _buf_val = temp_val; + } + _buf_nvalid += nvalid; + } + + __device__ inline void _add(K key, V val) + { + if (_nfill == 0) { + warp_sort(key, val); + _key[0] = key; + _val[0] = val; + } else if (_nfill == 1) { + warp_sort(key, val, false); + swap_if_needed(_key[0], key, _val[0], val); + if (depth > 1) { + _key[1] = key; + _val[1] = val; + warp_merge(_key[1], _val[1]); + } + warp_merge(_key[0], _val[0]); + } else if ((depth >= 2) && (_nfill == 2)) { + warp_sort(key, val, false); + swap_if_needed(_key[1], key, _val[1], val); + if (depth > 2) { + _key[2] = key; + _val[2] = val; + warp_merge(_key[2], 
_val[2]); + } + warp_merge(_key[1], _val[1], false); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if ((depth >= 3) && (_nfill == 3)) { + warp_sort(key, val, false); + swap_if_needed(_key[2], key, _val[2], val); + if (depth > 3) { + _key[3] = key; + _val[3] = val; + warp_merge(_key[3], _val[3]); + } + warp_merge(_key[2], _val[2], false); + swap_if_needed(_key[1], _key[2], _val[1], _val[2]); + warp_merge(_key[2], _val[2]); + warp_merge(_key[1], _val[1], false); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if ((depth >= 4) && (_nfill == 4)) { + warp_sort(key, val, false); + swap_if_needed(_key[3], key, _val[3], val); + warp_merge(_key[3], _val[3], false); + swap_if_needed(_key[2], _key[3], _val[2], _val[3]); + warp_merge(_key[3], _val[3]); + warp_merge(_key[2], _val[2], false); + swap_if_needed(_key[1], _key[2], _val[1], _val[2]); + warp_merge(_key[2], _val[2]); + warp_merge(_key[1], _val[1], false); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } + _nfill = min(_nfill + 1, depth); + if (_nfill == depth) { + _kth_key = + min(_kth_key, __shfl_sync(0xffffffff, _key[depth - 1], _topk - 1 - (depth - 1) * 32)); + } + } + + __device__ inline void _merge() + { + uint32_t warp_id = threadIdx.x / 32; + uint32_t num_warps = blockDim.x / 32; + K* smem_key = smemArray; + V* smem_val = (V*)(smem_key + (blockDim.x / 2) * depth); + for (int j = num_warps / 2; j > 0; j /= 2) { + __syncthreads(); + if ((j <= warp_id) && (warp_id < (j * 2))) { + uint32_t opp_tid = threadIdx.x - (j * 32); + smem_key[opp_tid] = _key[0]; + smem_val[opp_tid] = _val[0]; + if (depth >= 2) { + smem_key[opp_tid + (j * 32)] = _key[1]; + smem_val[opp_tid + (j * 32)] = _val[1]; + } + if (depth >= 3) { + smem_key[opp_tid + (j * 32) * 2] = _key[2]; + smem_val[opp_tid + (j * 32) * 2] = _val[2]; + } + if (depth >= 4) { + smem_key[opp_tid + (j * 32) * 3] = _key[3]; + smem_val[opp_tid + (j * 32) * 3] = _val[3]; + } + } + __syncthreads(); + if (warp_id < j) { + K key; + V val; + if (depth == 1) { + key = smem_key[threadIdx.x ^ 31]; + val = smem_val[threadIdx.x ^ 31]; + swap_if_needed(_key[0], key, _val[0], val); + + warp_merge(_key[0], _val[0]); + } else if (depth == 2) { + key = smem_key[threadIdx.x ^ 31 + (j * 32)]; + val = smem_val[threadIdx.x ^ 31 + (j * 32)]; + swap_if_needed(_key[0], key, _val[0], val); + key = smem_key[threadIdx.x ^ 31]; + val = smem_val[threadIdx.x ^ 31]; + swap_if_needed(_key[1], key, _val[1], val); + + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if (depth == 3) { + key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; + val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; + swap_if_needed(_key[1], key, _val[1], val); + key = smem_key[threadIdx.x ^ 31 + (j * 32)]; + val = smem_val[threadIdx.x ^ 31 + (j * 32)]; + swap_if_needed(_key[2], key, _val[2], val); + K _key_3_ = smem_key[threadIdx.x ^ 31]; + V _val_3_ = smem_val[threadIdx.x ^ 31]; + + swap_if_needed(_key[0], _key[2], _val[0], _val[2]); + swap_if_needed(_key[1], _key_3_, _val[1], _val_3_); + swap_if_needed(_key[2], _key_3_, _val[2], _val_3_); + warp_merge(_key[2], _val[2]); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } else if (depth == 4) { + key = smem_key[threadIdx.x ^ 31 + (j * 
32) * 3]; + val = smem_val[threadIdx.x ^ 31 + (j * 32) * 3]; + swap_if_needed(_key[0], key, _val[0], val); + key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; + val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; + swap_if_needed(_key[1], key, _val[1], val); + key = smem_key[threadIdx.x ^ 31 + (j * 32)]; + val = smem_val[threadIdx.x ^ 31 + (j * 32)]; + swap_if_needed(_key[2], key, _val[2], val); + key = smem_key[threadIdx.x ^ 31]; + val = smem_val[threadIdx.x ^ 31]; + swap_if_needed(_key[3], key, _val[3], val); + + swap_if_needed(_key[0], _key[2], _val[0], _val[2]); + swap_if_needed(_key[1], _key[3], _val[1], _val[3]); + swap_if_needed(_key[2], _key[3], _val[2], _val[3]); + warp_merge(_key[3], _val[3]); + warp_merge(_key[2], _val[2]); + swap_if_needed(_key[0], _key[1], _val[0], _val[1]); + warp_merge(_key[1], _val[1]); + warp_merge(_key[0], _val[0]); + } + } + } + } +}; + +// +template +__device__ inline void update_approx_global_score(uint32_t topk, + K* my_score, + K* approx_global_score) +{ + if (!__any_sync(0xffffffff, (my_score[0] < approx_global_score[topk - 1]))) { return; } + if (topk <= 32) { + K score = max_value_of(); + if (threadIdx.x < topk) { score = approx_global_score[threadIdx.x]; } + warp_sort(score, false); + swap_if_needed(my_score[0], score); + + warp_merge(my_score[0]); + if (threadIdx.x < topk) { atomicMin(approx_global_score + threadIdx.x, my_score[0]); } + } else if (topk <= 64) { + K score = max_value_of(); + if (threadIdx.x + 32 < topk) { score = approx_global_score[threadIdx.x + 32]; } + warp_sort(score, false); + swap_if_needed(my_score[0], score); + score = approx_global_score[threadIdx.x]; + warp_sort(score, false); + swap_if_needed(my_score[1], score); + + swap_if_needed(my_score[0], my_score[1]); + warp_merge(my_score[1]); + warp_merge(my_score[0]); + + atomicMin(approx_global_score + threadIdx.x, my_score[0]); + if (threadIdx.x + 32 < topk) { atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); } + } else if (topk <= 96) { + K score = max_value_of(); + if (threadIdx.x + 64 < topk) { score = approx_global_score[threadIdx.x + 64]; } + warp_sort(score, false); + swap_if_needed(my_score[1], score); + score = approx_global_score[threadIdx.x + 32]; + warp_sort(score, false); + swap_if_needed(my_score[2], score); + score = approx_global_score[threadIdx.x]; + warp_sort(score, false); + K my_score_3_ = score; + + swap_if_needed(my_score[0], my_score[2]); + swap_if_needed(my_score[1], my_score_3_); + swap_if_needed(my_score[2], my_score_3_); + warp_merge(my_score[2]); + swap_if_needed(my_score[0], my_score[1]); + warp_merge(my_score[1]); + warp_merge(my_score[0]); + + atomicMin(approx_global_score + threadIdx.x, my_score[0]); + atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); + if (threadIdx.x + 64 < topk) { atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); } + } else if (topk <= 128) { + K score = max_value_of(); + if (threadIdx.x + 96 < topk) { score = approx_global_score[threadIdx.x + 96]; } + warp_sort(score, false); + swap_if_needed(my_score[0], score); + score = approx_global_score[threadIdx.x + 64]; + warp_sort(score, false); + swap_if_needed(my_score[1], score); + score = approx_global_score[threadIdx.x + 32]; + warp_sort(score, false); + swap_if_needed(my_score[2], score); + score = approx_global_score[threadIdx.x]; + warp_sort(score, false); + swap_if_needed(my_score[3], score); + + swap_if_needed(my_score[0], my_score[2]); + swap_if_needed(my_score[1], my_score[3]); + swap_if_needed(my_score[2], my_score[3]); + 
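+    // (Exposition.) The swap_if_needed / warp_merge sequence in this branch implements a
+    // warp-level bitonic merge of two 128-element sorted sequences: the thread-local results in
+    // my_score[0..3] and the values read above from approx_global_score. The remaining merges
+    // below finish sorting the surviving (smaller) keys across the four registers and 32 lanes
+    // (warp_merge does the cross-lane compare-exchange via __shfl_xor_sync), after which they are
+    // folded into the global array with atomicMin.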
warp_merge(my_score[3]); + warp_merge(my_score[2]); + swap_if_needed(my_score[0], my_score[1]); + warp_merge(my_score[1]); + warp_merge(my_score[0]); + + atomicMin(approx_global_score + threadIdx.x, my_score[0]); + atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); + atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); + if (threadIdx.x + 96 < topk) { atomicMin(approx_global_score + threadIdx.x + 96, my_score[3]); } + } +} + +// +template +__device__ inline outDtype get_out_score(float score, distance::DistanceType metric) +{ + if (metric == distance::DistanceType::InnerProduct) { score = score / 2.0 - 1.0; } + if (sizeof(outDtype) == 2) { score = min(score, FP16_MAX); } + return (outDtype)score; +} + +// +// (*) Restrict the peak GPU occupancy up-to 50% by "__launch_bounds__(1024, 1)", +// as there were cases where performance dropped by a factor of two or more on V100 +// when the peak GPU occupancy was set to more than 50%. +// +template +__launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( + uint32_t numDataset, + uint32_t dimDataset, + uint32_t numProbes, + uint32_t dimPq, + uint32_t sizeBatch, + uint32_t maxSamples, + distance::DistanceType metric, + codebook_gen typePqCenter, + uint32_t topk, + const float* clusterCenters, // [numClusters, dimDataset,] + const float* pqCenters, // [dimPq, 1 << bitPq, lenPq,], or + // [numClusetrs, 1 << bitPq, lenPq,] + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const uint32_t* clusterIndexPtr, // [numClusters + 1,] + const uint32_t* _clusterLabels, // [sizeBatch, numProbes,] + const uint32_t* _chunkIndexPtr, // [sizeBatch, numProbes,] + const float* _query, // [sizeBatch, dimDataset,] + const uint32_t* indexList, // [sizeBatch * numProbes] + float* _preCompScores, // [...] 
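+  // (Note.) Two output modes are used by this kernel: when _topkIndex != NULL each
+  // (query, probe) pair keeps only its local top-k ("manageLocalTopk"), writing scores and
+  // indices of shape [sizeBatch, numProbes, topk]; otherwise every sample's score is written to
+  // [sizeBatch, maxSamples] and the top-k selection over those scores happens in a later pass.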
+ float* _topkScores, // [sizeBatch, topk] + outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] + uint32_t* _topkIndex // [sizeBatch, numProbes, topk] +) +{ + const uint32_t lenPq = dimDataset / dimPq; + float* smem = smemArray; + + smemLutDtype* preCompScores = (smemLutDtype*)smem; + float* baseDiff = NULL; + if (preCompBaseDiff) { baseDiff = (float*)(preCompScores + (dimPq << bitPq)); } + bool manageLocalTopk = false; + if (_topkIndex != NULL) { manageLocalTopk = true; } + + uint32_t iBatch; + uint32_t iProbe; + if (indexList == NULL) { + // iBatch = blockIdx.x / numProbes; + // iProbe = blockIdx.x % numProbes; + iBatch = blockIdx.x % sizeBatch; + iProbe = blockIdx.x / sizeBatch; + } else { + iBatch = indexList[blockIdx.x] / numProbes; + iProbe = indexList[blockIdx.x] % numProbes; + } + if (iBatch >= sizeBatch || iProbe >= numProbes) return; + + const uint32_t* clusterLabels = _clusterLabels + (numProbes * iBatch); + const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); + const float* query = _query + (dimDataset * iBatch); + outDtype* output; + uint32_t* topkIndex = NULL; + float* approx_global_score = NULL; + if (manageLocalTopk) { + // Store topk calculated distances to output (and its indices to topkIndex) + output = _output + (topk * (iProbe + (numProbes * iBatch))); + topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); + approx_global_score = _topkScores + (topk * iBatch); + } else { + // Store all calculated distances to output + output = _output + (maxSamples * iBatch); + } + uint32_t label = clusterLabels[iProbe]; + const float* myClusterCenter = clusterCenters + (dimDataset * label); + const float* myPqCenters; + if (typePqCenter == codebook_gen::PER_SUBSPACE) { + myPqCenters = pqCenters; + } else { + myPqCenters = pqCenters + (lenPq << bitPq) * label; + } + + if (preCompBaseDiff) { + // Reduce computational complexity by pre-computing the difference + // between the cluster centroid and the query. + for (uint32_t i = threadIdx.x; i < dimDataset; i += blockDim.x) { + baseDiff[i] = query[i] - myClusterCenter[i]; + } + __syncthreads(); + } + + // Create a lookup table + for (uint32_t i = threadIdx.x; i < (dimPq << bitPq); i += blockDim.x) { + uint32_t iPq = i >> bitPq; + uint32_t iCode = i & ((1 << bitPq) - 1); + float score = 0.0; + for (uint32_t j = 0; j < lenPq; j++) { + uint32_t k = j + (lenPq * iPq); + float diff; + if (preCompBaseDiff) { + diff = baseDiff[k]; + } else { + diff = query[k] - myClusterCenter[k]; + } + if (typePqCenter == codebook_gen::PER_SUBSPACE) { + diff -= myPqCenters[j + (lenPq * i)]; + } else { + diff -= myPqCenters[j + (lenPq * iCode)]; + } + score += diff * diff; + } + preCompScores[i] = (smemLutDtype)score; + } + + uint32_t iSampleBase = 0; + if (iProbe > 0) { iSampleBase = chunkIndexPtr[iProbe - 1]; } + uint32_t nSamples = chunkIndexPtr[iProbe] - iSampleBase; + uint32_t nSamples32 = nSamples; + if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } + uint32_t iDatasetBase = clusterIndexPtr[label]; + + BlockTopk block_topk( + topk, manageLocalTopk ? 
approx_global_score + topk - 1 : NULL); + __syncthreads(); + + // Compute a distance for each sample + for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { + float score = FLT_MAX; + if (i < nSamples) { + score = ivfpq_compute_score( + dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); + } + if (!manageLocalTopk) { + if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } + } else { + uint32_t val = i; + block_topk.add(score, val); + } + } + if (!manageLocalTopk) { return; } + block_topk.finalize(); + + // Output topk score and index + uint32_t warp_id = threadIdx.x / 32; + if (warp_id == 0) { + for (int j = 0; j < depth; j++) { + if (threadIdx.x + (32 * j) < topk) { + output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); + topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; + } + } + } + + // Approximate update of global topk entries + if (warp_id == 0) { + float my_score[depth]; + for (int j = 0; j < depth; j++) { + my_score[j] = block_topk.key(j); + } + update_approx_global_score(topk, my_score, approx_global_score); + } +} + +// +template +__launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( + uint32_t numDataset, + uint32_t dimDataset, + uint32_t numProbes, + uint32_t dimPq, + uint32_t sizeBatch, + uint32_t maxSamples, + distance::DistanceType metric, + codebook_gen typePqCenter, + uint32_t topk, + const float* clusterCenters, // [numClusters, dimDataset,] + const float* pqCenters, // [dimPq, 1 << bitPq, lenPq,], or + // [numClusetrs, 1 << bitPq, lenPq,] + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const uint32_t* clusterIndexPtr, // [numClusters + 1,] + const uint32_t* _clusterLabels, // [sizeBatch, numProbes,] + const uint32_t* _chunkIndexPtr, // [sizeBatch, numProbes,] + const float* _query, // [sizeBatch, dimDataset,] + const uint32_t* indexList, // [sizeBatch * numProbes] + float* _preCompScores, // [..., dimPq << bitPq,] + float* _topkScores, // [sizeBatch, topk] + outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] + uint32_t* _topkIndex // [sizeBatch, numProbes, topk] +) +{ + const uint32_t lenPq = dimDataset / dimPq; + + float* preCompScores = _preCompScores + ((dimPq << bitPq) * blockIdx.x); + float* baseDiff = NULL; + if (preCompBaseDiff) { baseDiff = (float*)smemArray; } + bool manageLocalTopk = false; + if (_topkIndex != NULL) { manageLocalTopk = true; } + + for (int ib = blockIdx.x; ib < sizeBatch * numProbes; ib += gridDim.x) { + uint32_t iBatch; + uint32_t iProbe; + if (indexList == NULL) { + // iBatch = ib / numProbes; + // iProbe = ib % numProbes; + iBatch = ib % sizeBatch; + iProbe = ib / sizeBatch; + } else { + iBatch = indexList[ib] / numProbes; + iProbe = indexList[ib] % numProbes; + } + + const uint32_t* clusterLabels = _clusterLabels + (numProbes * iBatch); + const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); + const float* query = _query + (dimDataset * iBatch); + outDtype* output; + uint32_t* topkIndex = NULL; + float* approx_global_score = NULL; + if (manageLocalTopk) { + // Store topk calculated distances to output (and its indices to topkIndex) + output = _output + (topk * (iProbe + (numProbes * iBatch))); + topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); + approx_global_score = _topkScores + (topk * iBatch); + } else { + // Store all calculated distances to output + output = _output + (maxSamples * iBatch); + } + uint32_t label = 
clusterLabels[iProbe]; + const float* myClusterCenter = clusterCenters + (dimDataset * label); + const float* myPqCenters; + if (typePqCenter == codebook_gen::PER_SUBSPACE) { + myPqCenters = pqCenters; + } else { + myPqCenters = pqCenters + (lenPq << bitPq) * label; + } + + if (preCompBaseDiff) { + // Reduce computational complexity by pre-computing the difference + // between the cluster centroid and the query. + for (uint32_t i = threadIdx.x; i < dimDataset; i += blockDim.x) { + baseDiff[i] = query[i] - myClusterCenter[i]; + } + __syncthreads(); + } + + // Create a lookup table + for (uint32_t i = threadIdx.x; i < (dimPq << bitPq); i += blockDim.x) { + uint32_t iPq = i >> bitPq; + uint32_t iCode = i & ((1 << bitPq) - 1); + float score = 0.0; + for (uint32_t j = 0; j < lenPq; j++) { + uint32_t k = j + (lenPq * iPq); + float diff; + if (preCompBaseDiff) { + diff = baseDiff[k]; + } else { + diff = query[k] - myClusterCenter[k]; + } + if (typePqCenter == codebook_gen::PER_SUBSPACE) { + diff -= myPqCenters[j + (lenPq * i)]; + } else { + diff -= myPqCenters[j + (lenPq * iCode)]; + } + score += diff * diff; + } + preCompScores[i] = score; + } + + uint32_t iSampleBase = 0; + if (iProbe > 0) { iSampleBase = chunkIndexPtr[iProbe - 1]; } + uint32_t nSamples = chunkIndexPtr[iProbe] - iSampleBase; + uint32_t nSamples32 = nSamples; + if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } + uint32_t iDatasetBase = clusterIndexPtr[label]; + + BlockTopk block_topk( + topk, manageLocalTopk ? approx_global_score + topk - 1 : NULL); + __syncthreads(); + + // Compute a distance for each sample + for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { + float score = FLT_MAX; + if (i < nSamples) { + score = ivfpq_compute_score( + dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); + } + if (!manageLocalTopk) { + if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } + } else { + uint32_t val = i; + block_topk.add(score, val); + } + } + __syncthreads(); + if (!manageLocalTopk) { + continue; // for (int ib ...) 
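+      // Note: the score produced by ivfpq_compute_score in the loop above is the usual
+      // product-quantization asymmetric-distance lookup: each encoded sub-vector of the
+      // sample selects one entry of the per-subspace table and the selected entries are
+      // summed. A sequential sketch of the same computation, assuming bitPq == 8 so that
+      // every code occupies exactly one byte of pqDataset:
+      //
+      //   const uint8_t* code = pqDataset + size_t(i + iDatasetBase) * dimPq;
+      //   float score = 0.0f;
+      //   for (uint32_t iPq = 0; iPq < dimPq; iPq++) {
+      //     score += preCompScores[(iPq << bitPq) + code[iPq]];
+      //   }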
+ } + block_topk.finalize(); + + // Output topk score and index + uint32_t warp_id = threadIdx.x / 32; + if (warp_id == 0) { + for (int j = 0; j < depth; j++) { + if (threadIdx.x + (32 * j) < topk) { + output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); + topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; + } + } + } + + // Approximate update of global topk entries + if (warp_id == 0) { + float my_score[depth]; + for (int j = 0; j < depth; j++) { + my_score[j] = block_topk.key(j); + } + update_approx_global_score(topk, my_score, approx_global_score); + } + __syncthreads(); + } +} + +// search +template +inline void ivfpq_search(const handle_t& handle, + cuannIvfPqDescriptor_t& desc, + uint32_t numQueries, + const float* clusterCenters, // [numDataset, dimRotDataset] + const float* pqCenters, // [dimPq, 1 << desc->bitPq, lenPq] + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const uint32_t* originalNumbers, // [numDataset] + const uint32_t* cluster_offsets, // [numClusters + 1] + const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] + const float* query, // [numQueries, dimRotDataset] + uint64_t* topkNeighbors, // [numQueries, topK] + float* topkDistances, // [numQueries, topK] + void* workspace) +{ + RAFT_EXPECTS(numQueries <= desc->maxBatchSize, + "number of queries (%u) must be smaller the max batch size (%u)", + numQueries, + desc->maxBatchSize); + + uint32_t* clusterLabelsOut; // [maxBatchSize, numProbes] + uint32_t* indexList; // [maxBatchSize * numProbes] + uint32_t* indexListSorted; // [maxBatchSize * numProbes] + uint32_t* numSamples; // [maxBatchSize,] + void* cubWorkspace; // ... + uint32_t* chunkIndexPtr; // [maxBatchSize, numProbes] + uint32_t* topkSids; // [maxBatchsize, topk] + scoreDtype* similarity; // [maxBatchSize, maxSamples] or + // [maxBatchSize, numProbes, topk] + uint32_t* simTopkIndex; // [maxBatchSize, numProbes, topk] + float* topkScores; // [maxBatchSize, topk] + float* preCompScores = NULL; + void* topkWorkspace; + + clusterLabelsOut = (uint32_t*)workspace; + indexList = + (uint32_t*)((uint8_t*)clusterLabelsOut + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + indexListSorted = + (uint32_t*)((uint8_t*)indexList + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + numSamples = + (uint32_t*)((uint8_t*)indexListSorted + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + cubWorkspace = + (void*)((uint8_t*)numSamples + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize)); + chunkIndexPtr = (uint32_t*)((uint8_t*)cubWorkspace + desc->sizeCubWorkspace); + topkSids = + (uint32_t*)((uint8_t*)chunkIndexPtr + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); + similarity = + (scoreDtype*)((uint8_t*)topkSids + + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->topK)); + if (manage_local_topk(desc)) { + topkScores = + (float*)((uint8_t*)similarity + Pow2<128>::roundUp(sizeof(scoreDtype) * desc->maxBatchSize * + desc->numProbes * desc->topK)); + simTopkIndex = (uint32_t*)((uint8_t*)topkScores + + Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK)); + preCompScores = + (float*)((uint8_t*)simTopkIndex + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * + desc->numProbes * desc->topK)); + } else { + topkScores = NULL; + simTopkIndex = NULL; + preCompScores = + (float*)((uint8_t*)similarity + + Pow2<128>::roundUp(sizeof(scoreDtype) * 
desc->maxBatchSize * desc->maxSamples)); + } + topkWorkspace = + (void*)((uint8_t*)preCompScores + Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * + desc->dimPq * (1 << desc->bitPq))); + + // + if (manage_local_topk(desc)) { + dim3 iksThreads(128, 1, 1); + dim3 iksBlocks(((numQueries * desc->topK) + iksThreads.x - 1) / iksThreads.x, 1, 1); + ivfpq_init_topkScores<<>>( + topkScores, FLT_MAX, numQueries * desc->topK); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + handle.sync_stream(); +#endif + } + + // + dim3 mcThreads(1024, 1, 1); // DO NOT CHANGE + dim3 mcBlocks(numQueries, 1, 1); + ivfpq_make_chunk_index_ptr<<>>( + desc->numProbes, numQueries, cluster_offsets, clusterLabelsToProbe, chunkIndexPtr, numSamples); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + handle.sync_stream(); +#endif + + if (numQueries * desc->numProbes > 256) { + // Sorting index by cluster number (label). + // The goal is to incrase the L2 cache hit rate to read the vectors + // of a cluster by processing the cluster at the same time as much as + // possible. + dim3 psThreads(128, 1, 1); + dim3 psBlocks((numQueries * desc->numProbes + psThreads.x - 1) / psThreads.x, 1, 1); + ivfpq_prep_sort<<>>(numQueries * desc->numProbes, + indexList); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + handle.sync_stream(); +#endif + + int begin_bit = 0; + int end_bit = sizeof(uint32_t) * 8; + cub::DeviceRadixSort::SortPairs(cubWorkspace, + desc->sizeCubWorkspace, + clusterLabelsToProbe, + clusterLabelsOut, + indexList, + indexListSorted, + numQueries * desc->numProbes, + begin_bit, + end_bit, + handle.get_stream()); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + handle.sync_stream(); +#endif + } else { + indexListSorted = NULL; + } + + // Select a GPU kernel for distance calculation +#define SET_KERNEL1(B, V, T, D) \ + do { \ + static_assert((B * V) % (sizeof(T) * 8) == 0); \ + kernel_no_basediff = ivfpq_compute_similarity; \ + kernel_fast = ivfpq_compute_similarity; \ + kernel_no_smem_lut = ivfpq_compute_similarity_no_smem_lut; \ + } while (0) + +#define SET_KERNEL2(B, M, D) \ + do { \ + RAFT_EXPECTS(desc->dimPq % M == 0, "dimPq must be a multiple of %u", M); \ + if (desc->dimPq % (M * 8) == 0) { \ + SET_KERNEL1(B, (M * 8), uint64_t, D); \ + } else if (desc->dimPq % (M * 4) == 0) { \ + SET_KERNEL1(B, (M * 4), uint32_t, D); \ + } else if (desc->dimPq % (M * 2) == 0) { \ + SET_KERNEL1(B, (M * 2), uint16_t, D); \ + } else if (desc->dimPq % (M * 1) == 0) { \ + SET_KERNEL1(B, (M * 1), uint8_t, D); \ + } \ + } while (0) + +#define SET_KERNEL3(D) \ + do { \ + switch (desc->bitPq) { \ + case 4: SET_KERNEL2(4, 2, D); break; \ + case 5: SET_KERNEL2(5, 8, D); break; \ + case 6: SET_KERNEL2(6, 4, D); break; \ + case 7: SET_KERNEL2(7, 8, D); break; \ + case 8: SET_KERNEL2(8, 1, D); break; \ + } \ + } while (0) + + typedef void (*kernel_t)(uint32_t, + uint32_t, + uint32_t, + uint32_t, + uint32_t, + uint32_t, + distance::DistanceType, + codebook_gen, + uint32_t, + const float*, + const float*, + const uint8_t*, + const uint32_t*, + const uint32_t*, + const uint32_t*, + const float*, + const uint32_t*, + float*, + float*, + scoreDtype*, + uint32_t*); + kernel_t kernel_no_basediff; + kernel_t kernel_fast; + kernel_t kernel_no_smem_lut; + int depth = 1; + if (manage_local_topk(desc)) { depth = (desc->topK + 31) / 32; } + switch (depth) { + case 1: SET_KERNEL3(1); break; + case 2: SET_KERNEL3(2); break; + case 3: SET_KERNEL3(3); break; + case 4: SET_KERNEL3(4); break; + default: RAFT_FAIL("ivf_pq::search(k = %u): depth value is too big 
(%d)", desc->topK, depth); + } + RAFT_LOG_DEBUG("ivf_pq::search(k = %u, depth = %d, dim = %u/%u/%u)", + desc->topK, + depth, + desc->dimDataset, + desc->dimRotDataset, + desc->dimPq); + constexpr size_t thresholdSmem = 48 * 1024; + size_t sizeSmem = sizeof(smemLutDtype) * desc->dimPq * (1 << desc->bitPq); + size_t sizeSmemBaseDiff = sizeof(float) * desc->dimRotDataset; + + uint32_t numCTAs = numQueries * desc->numProbes; + int numThreads = 1024; + // desc->preferredThreadBlockSize == 0 means using auto thread block size calculation mode + if (desc->preferredThreadBlockSize == 0) { + constexpr int minThreads = 256; + while (numThreads > minThreads) { + if (numCTAs < uint32_t(getMultiProcessorCount() * (1024 / (numThreads / 2)))) { break; } + if (handle.get_device_properties().sharedMemPerMultiprocessor * 2 / 3 < + sizeSmem * (1024 / (numThreads / 2))) { + break; + } + numThreads /= 2; + } + } else { + numThreads = desc->preferredThreadBlockSize; + } + size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); + sizeSmem = max(sizeSmem, sizeSmemForLocalTopk); + + kernel_t kernel = kernel_no_basediff; + + bool kernel_no_basediff_available = true; + if (sizeSmem > thresholdSmem) { + cudaError_t cudaError = cudaFuncSetAttribute( + kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem); + if (cudaError != cudaSuccess) { + RAFT_EXPECTS( + cudaError == cudaGetLastError(), + "Tried to reset the expected cuda error code, but it didn't match the expectation"); + kernel_no_basediff_available = false; + + // Use "kernel_no_smem_lut" which just uses small amount of shared memory. + kernel = kernel_no_smem_lut; + numThreads = 1024; + size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); + sizeSmem = max(sizeSmemBaseDiff, sizeSmemForLocalTopk); + numCTAs = getMultiProcessorCount(); + } + } + if (kernel_no_basediff_available) { + bool kernel_fast_available = true; + if (sizeSmem + sizeSmemBaseDiff > thresholdSmem) { + cudaError_t cudaError = cudaFuncSetAttribute( + kernel_fast, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem + sizeSmemBaseDiff); + if (cudaError != cudaSuccess) { + RAFT_EXPECTS( + cudaError == cudaGetLastError(), + "Tried to reset the expected cuda error code, but it didn't match the expectation"); + kernel_fast_available = false; + } + } + if (kernel_fast_available) { + int numBlocks_kernel_no_basediff = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks_kernel_no_basediff, kernel_no_basediff, numThreads, sizeSmem)); + + int numBlocks_kernel_fast = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks_kernel_fast, kernel_fast, numThreads, sizeSmem + sizeSmemBaseDiff)); + + // Use "kernel_fast" only if GPU occupancy does not drop + if (numBlocks_kernel_no_basediff == numBlocks_kernel_fast) { + kernel = kernel_fast; + sizeSmem += sizeSmemBaseDiff; + } + } + } + dim3 ctaThreads(numThreads, 1, 1); + dim3 ctaBlocks(numCTAs, 1, 1); + kernel<<>>(desc->numDataset, + desc->dimRotDataset, + desc->numProbes, + desc->dimPq, + numQueries, + desc->maxSamples, + desc->metric, + desc->typePqCenter, + desc->topK, + clusterCenters, + pqCenters, + pqDataset, + cluster_offsets, + clusterLabelsToProbe, + chunkIndexPtr, + query, + indexListSorted, + preCompScores, + topkScores, + (scoreDtype*)similarity, + simTopkIndex); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + handle.sync_stream(); +#endif + + // Select topk vectors for each query + if (simTopkIndex == NULL) { + _cuann_find_topk(handle, + 
desc->topK, + numQueries, + desc->maxSamples, + numSamples, + (scoreDtype*)similarity, + topkSids, + topkWorkspace); + } else { + _cuann_find_topk(handle, + desc->topK, + numQueries, + (desc->numProbes * desc->topK), + NULL, + (scoreDtype*)similarity, + topkSids, + topkWorkspace); + } +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + handle.sync_stream(); +#endif + + // + dim3 moThreads(128, 1, 1); + dim3 moBlocks((desc->topK + moThreads.x - 1) / moThreads.x, numQueries, 1); + ivfpq_make_outputs + <<>>(desc->numProbes, + desc->topK, + desc->maxSamples, + numQueries, + cluster_offsets, + originalNumbers, + clusterLabelsToProbe, + chunkIndexPtr, + (scoreDtype*)similarity, + simTopkIndex, + topkSids, + topkNeighbors, + topkDistances); +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) + handle.sync_stream(); +#endif +} + +} // namespace raft::spatial::knn::ivf_pq::detail diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh new file mode 100644 index 0000000000..2057b55cd0 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../ivf_pq_types.hpp" +#include "ann_utils.cuh" +#include "ivf_pq_legacy.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace raft::spatial::knn::ivf_pq::detail { + +using namespace raft::spatial::knn::detail; // NOLINT + +/** See raft::spatial::knn::ivf_pq::search docs */ +template +inline void search(const handle_t& handle, + const search_params& params, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr = nullptr) +{ + static_assert(std::is_same_v, + "Only uint64_t index output is supported at this time."); + common::nvtx::range fun_scope( + "ivf_pq::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); + + RAFT_EXPECTS(params.n_probes > 0, + "n_probes (number of clusters to probe in the search) must be positive."); + auto n_probes = std::min(params.n_probes, index.n_lists()); + + auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16); + if (pool_guard) { + RAFT_LOG_DEBUG("ivf_pq::search: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + auto& cuann_desc = const_cast(index.desc()); + + // set search parameters + ivf_pq::detail::cuannIvfPqSetSearchParameters(cuann_desc, n_probes, k); + ivf_pq::detail::cuannIvfPqSetSearchTuningParameters(cuann_desc, + params.internal_distance_dtype, + params.smem_lut_dtype, + params.preferred_thread_block_size); + // Maximum number of query vectors to search at the same time. 
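+  // (The workspace allocated below is sized for at most this many queries and is further
+  // capped at 2 GiB; when n_queries exceeds the cap, cuannIvfPqSearch presumably processes
+  // the queries in chunks of at most batch_size rather than holding them all at once.)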
+ uint32_t batch_size = std::min(n_queries, 32768); + size_t max_ws_size = (size_t)2 * 1024 * 1024 * 1024; // 2 GiB + // Allocate memory for index + size_t ivf_pq_search_workspace_size; + ivf_pq::detail::cuannIvfPqSearch_bufferSize( + handle, cuann_desc, batch_size, max_ws_size, &ivf_pq_search_workspace_size); + rmm::device_buffer ivf_pq_search_ws_buf(ivf_pq_search_workspace_size, handle.get_stream(), mr); + + // finally, search! + ivf_pq::detail::cuannIvfPqSearch( + handle, cuann_desc, queries, n_queries, neighbors, distances, ivf_pq_search_ws_buf.data()); +} + +} // namespace raft::spatial::knn::ivf_pq::detail diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh index 9ae4fca131..abf5f4b01a 100644 --- a/cpp/include/raft/spatial/knn/ivf_pq.cuh +++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh @@ -13,6150 +13,182 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once -#include "detail/ann_kmeans_balanced.cuh" -#include "detail/ann_utils.cuh" +#include "detail/ivf_pq_build.cuh" +#include "detail/ivf_pq_search.cuh" +#include "ivf_pq_types.hpp" -#include #include -#include -#include -#include -#include -#include #include #include -/////////////////// -#include -#include -#include -#include - -////////////////// - namespace raft::spatial::knn::ivf_pq { /** - * - * - * - * - * - * fp_8bit + * @brief Build the index from the dataset for efficient search. + * + * NB: Currently, the following distance metrics are supported: + * - L2Expanded + * - L2Unexpanded + * - InnerProduct (TODO: incorrect distance atm) + * + * Usage example: + * @code{.cpp} + * using namespace raft::spatial::knn; + * // use default index parameters + * ivf_pq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_pq::build(handle, index_params, dataset, N, D); + * // use default search parameters + * ivf_pq::search_params search_params; + * // search K nearest neighbours for each of the N queries + * ivf_pq::search(handle, search_params, index, queries, N, K, out_inds, out_dists); + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + * @param handle + * @param params configure the index building + * @param[in] dataset a managed pointer to a row-major matrix [n_rows, dim] + * @param n_rows the number of samples + * @param dim the dimensionality of the data + * + * @return the constructed ivf-pq index */ - -template -struct fp_8bit; - -template -__device__ __host__ fp_8bit __float2fp_8bit(const float v); -template -__device__ __host__ float __fp_8bit2float(const fp_8bit& v); - -template -struct fp_8bit { - uint8_t bitstring; - - __device__ __host__ fp_8bit(const uint8_t bs) { bitstring = bs; } - __device__ __host__ fp_8bit(const float fp) - { - bitstring = __float2fp_8bit(fp).bitstring; - } - __device__ __host__ fp_8bit& operator=(const float fp) - { - bitstring = __float2fp_8bit(fp).bitstring; - return *this; - } - - __device__ __host__ operator float() const { return __fp_8bit2float(*this); } -}; - -// Since __float_as_uint etc can not be used in host codes, -// these converters are needed for test. -union cvt_fp_32bit { - float fp; - uint32_t bs; -}; -union cvt_fp_16bit { - half fp; - uint16_t bs; -}; - -// Type converters -template -__device__ __host__ fp_8bit __float2fp_8bit(const float v) -{ - if (v < 1. 
/ (1u << ((1u << (expBitLen - 1)) - 1))) - return fp_8bit{static_cast(0)}; - return fp_8bit{static_cast( - (cvt_fp_32bit{.fp = v}.bs + (((1u << (expBitLen - 1)) - 1) << 23) - 0x3f800000u) >> - (15 + expBitLen))}; -} - -template -__device__ __host__ float __fp_8bit2float(const fp_8bit& v) +template +inline auto build( + const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) + -> index { - return cvt_fp_32bit{.bs = ((v.bitstring << (15 + expBitLen)) + - (0x3f800000u | (0x00400000u >> (8 - expBitLen))) - - (((1u << (expBitLen - 1)) - 1) << 23))} - .fp; + return raft::spatial::knn::ivf_pq::detail::build(handle, params, dataset, n_rows, dim); } /** - * - * end of fp8bit - * + * @brief Build a new index containing the data of the original plus new extra vectors. + * + * Implementation note: + * The new data is clustered according to existing kmeans clusters, then the cluster + * centers are adjusted to match the newly labeled data. + * + * Usage example: + * @code{.cpp} + * using namespace raft::spatial::knn; + * ivf_pq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_pq::build(handle, index_params, dataset, N, D); + * // fill the index with the data + * auto index = ivf_pq::extend(handle, index_empty, dataset, nullptr, N); + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + * @param handle + * @param orig_index original index + * @param[in] new_vectors a managed pointer to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices a managed pointer to a vector of indices [n_rows]. + * If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr` + * here to imply a continuous range `[0...n_rows)`. 
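+ * (For example, assuming `more_vectors` and `more_indices` hold `M` extra rows and their
+ * user-defined ids, a call with explicit indices would look like
+ * `auto bigger_index = ivf_pq::extend(handle, index, more_vectors, more_indices, M);`.)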
+ * @param n_rows the number of samples + * + * @return the constructed extended ivf-pq index */ - -using namespace cub; - -// -extern __shared__ float smemArray[]; - -#define FP16_MAX 65504.0 - -/* CUANN PQ center type */ -typedef enum { - CUANN_PQ_CENTER_PER_SUBSPACE = 0, - CUANN_PQ_CENTER_PER_CLUSTER = 1, -} cuannPqCenter_t; - -/* IvfPq */ -struct cuannIvfPqDescriptor { - uint32_t numClusters; - uint32_t numDataset; - uint32_t dimDataset; - uint32_t dimDatasetExt; - uint32_t dimRotDataset; - uint32_t dimPq; - uint32_t bitPq; - distance::DistanceType metric; - cuannPqCenter_t typePqCenter; - cudaDataType_t dtypeDataset; - cudaDataType_t internalDistanceDtype; - cudaDataType_t smemLutDtype; - uint32_t indexVersion; - uint32_t maxClusterSize; - uint32_t lenPq; // dimRotDataset / dimPq - uint32_t numProbes; - uint32_t topK; - uint32_t maxQueries; - uint32_t maxBatchSize; - uint32_t maxSamples; - uint32_t* inclusiveSumSortedClusterSize; // [numClusters,] - float* sqsumClusters; // [numClusters,] - size_t sizeCubWorkspace; - uint32_t _numClustersSize0; // (*) urgent WA, need to be fixed - uint32_t preferredThreadBlockSize; - void* index_ptr; -}; -using cuannIvfPqDescriptor_t = - std::unique_ptr>; - -cuannIvfPqDescriptor_t cuannIvfPqCreateDescriptor() -{ - return cuannIvfPqDescriptor_t{[]() { - auto desc = new cuannIvfPqDescriptor{}; - desc->numClusters = 0; - desc->numDataset = 0; - desc->dimDataset = 0; - desc->dimDatasetExt = 0; - desc->dimRotDataset = 0; - desc->dimPq = 0; - desc->bitPq = 0; - desc->numProbes = 0; - desc->topK = 0; - desc->maxQueries = 0; - desc->maxBatchSize = 0; - desc->maxSamples = 0; - desc->inclusiveSumSortedClusterSize = nullptr; - desc->sqsumClusters = nullptr; - desc->index_ptr = nullptr; - return desc; - }(), - [](cuannIvfPqDescriptor* desc) { - if (desc->inclusiveSumSortedClusterSize != nullptr) { - free(desc->inclusiveSumSortedClusterSize); - } - if (desc->sqsumClusters != nullptr) { - RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->sqsumClusters)); - } - if (desc->index_ptr != nullptr) { - RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->index_ptr)); - } - delete desc; - }}; -} - -// header of index -struct cuannIvfPqIndexHeader { - // (*) DO NOT CHANGE ORDER - size_t indexSize; - uint32_t version; - uint32_t numClusters; - uint32_t numDataset; - uint32_t dimDataset; - uint32_t dimPq; - uint32_t metric; - uint32_t maxClusterSize; - uint32_t dimRotDataset; - uint32_t bitPq; - uint32_t typePqCenter; - uint32_t dtypeDataset; - uint32_t dimDatasetExt; - uint32_t numDatasetAdded; - uint32_t _dummy[256 - 15]; -}; - -// -inline char* _cuann_get_dtype_string(cudaDataType_t dtype, char* string) -{ - if (dtype == CUDA_R_32F) - sprintf(string, "float (CUDA_R_32F)"); - else if (dtype == CUDA_R_16F) - sprintf(string, "half (CUDA_R_16F)"); - else if (dtype == CUDA_R_8U) - sprintf(string, "uint8 (CUDA_R_8U)"); - else if (dtype == CUDA_R_8I) - sprintf(string, "int8 (CUDA_R_8I)"); - else - sprintf(string, "unknown"); - return string; -} - -// copy -template -__global__ void kern_copy(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, ldDst] - uint32_t ldDst, - D divisor) -{ - uint32_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint32_t iCol = gid % nCols; - uint32_t iRow = gid / nCols; - if (iRow >= nRows) return; - dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor; -} - -// copy -template -inline void _cuann_copy(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, 
ldDst] - uint32_t ldDst, - D divisor) -{ - uint32_t nThreads = 128; - uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; - kern_copy<<>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor); -} - -template void _cuann_copy(uint32_t nRows, - uint32_t nCols, - const float* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float divisor); -template void _cuann_copy(uint32_t nRows, - uint32_t nCols, - const uint32_t* src, - uint32_t ldSrc, - uint8_t* dst, - uint32_t ldDst, - uint8_t divisor); -template void _cuann_copy(uint32_t nRows, - uint32_t nCols, - const uint8_t* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float divisor); -template void _cuann_copy(uint32_t nRows, - uint32_t nCols, - const int8_t* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float divisor); - -// copy_fill -template -__global__ void kern_copy_fill(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, ldDst] - uint32_t ldDst, - D fillValue, - D divisor) -{ - uint32_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint32_t iCol = gid % ldDst; - uint32_t iRow = gid / ldDst; - if (iRow >= nRows) return; - if (iCol < nCols) { - dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor; - } else { - dst[iCol + (ldDst * iRow)] = fillValue; - } -} - -// copy_fill -template -inline void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const S* src, // [nRows, ldSrc] - uint32_t ldSrc, - D* dst, // [nRows, ldDst] - uint32_t ldDst, - D fillValue, - D divisor, - cudaStream_t stream) -{ - RAFT_EXPECTS(ldSrc >= nCols, "src leading dimension must be larger than nCols"); - RAFT_EXPECTS(ldDst >= nCols, "dist leading dimension must be larger than nCols"); - uint32_t nThreads = 128; - uint32_t nBlocks = ((nRows * ldDst) + nThreads - 1) / nThreads; - kern_copy_fill - <<>>(nRows, nCols, src, ldSrc, dst, ldDst, fillValue, divisor); -} - -template void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const float* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float fillValue, - float divisor, - cudaStream_t stream); -template void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const uint8_t* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float fillValue, - float divisor, - cudaStream_t stream); -template void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const int8_t* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float fillValue, - float divisor, - cudaStream_t stream); - -// a -= b -__global__ void kern_a_me_b(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - uint32_t ldA, - float* b // [nCols] -) -{ - uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint64_t iCol = gid % nCols; - uint64_t iRow = gid / nCols; - if (iRow >= nRows) return; - a[iCol + (ldA * iRow)] -= b[iCol]; -} - -// a -= b -inline void _cuann_a_me_b(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - uint32_t ldA, - float* b // [nCols] -) -{ - uint32_t nThreads = 128; - uint32_t nBlocks = ((nRows * nCols) + nThreads - 1) / nThreads; - kern_a_me_b<<>>(nRows, nCols, a, ldA, b); -} - -// normalize -__global__ void kern_normalize(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - const uint32_t* numSamples // [nRows,] -) -{ - uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x); - if (iRow >= nRows) return; - if (numSamples != NULL and numSamples[iRow] < 1) return; - - float sqsum = 0.0; - for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { - float val = 
a[iCol + (nCols * iRow)]; - sqsum += val * val; - } - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); - sqsum = sqrt(sqsum); - for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) { - a[iCol + (nCols * iRow)] /= sqsum; - } -} - -// normalize -inline void _cuann_normalize(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - const uint32_t* numSamples = nullptr // [nRows,] -) -{ - dim3 threads(32, 4, 1); // DO NOT CHANGE - dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1); - kern_normalize<<>>(nRows, nCols, a, numSamples); -} - -// divide -__global__ void kern_divide(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - const uint32_t* numSamples // [nRows,] -) -{ - uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint64_t iRow = gid / nCols; - if (iRow >= nRows) return; - if (numSamples[iRow] == 0) return; - a[gid] /= numSamples[iRow]; -} - -// divide -inline void _cuann_divide(uint32_t nRows, - uint32_t nCols, - float* a, // [nRows, nCols] - const uint32_t* numSamples // [nRows,] -) -{ - dim3 threads(128, 1, 1); - dim3 blocks(((uint64_t)nRows * nCols + threads.x - 1) / threads.x, 1, 1); - kern_divide<<>>(nRows, nCols, a, numSamples); -} - -// -template -__global__ void kern_transpose_copy_3d(uint32_t num0, - uint32_t num1, - uint32_t num2, - D* dst, // [num2, ld1, ld0] - uint32_t ld0, - uint32_t ld1, - const S* src, // [...] - uint32_t stride0, - uint32_t stride1, - uint32_t stride2) -{ - uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); - if (tid >= num0 * num1 * num2) return; - uint32_t i0 = tid % num0; - uint32_t i1 = (tid / num0) % num1; - uint32_t i2 = (tid / num0) / num1; - - dst[i0 + (ld0 * i1) + (ld0 * ld1 * i2)] = src[(stride0 * i0) + (stride1 * i1) + (stride2 * i2)]; -} - -// transpose_copy_3d -template -inline void _cuann_transpose_copy_3d(uint32_t num0, - uint32_t num1, - uint32_t num2, - D* dst, // [num2, ld1, ld0] - uint32_t ld0, - uint32_t ld1, - const S* src, // [...] - uint32_t stride0, - uint32_t stride1, - uint32_t stride2) -{ - uint32_t nThreads = 128; - uint32_t nBlocks = ((num0 * num1 * num2) + nThreads - 1) / nThreads; - kern_transpose_copy_3d - <<>>(num0, num1, num2, dst, ld0, ld1, src, stride0, stride1, stride2); -} - -template void _cuann_transpose_copy_3d(uint32_t num0, - uint32_t num1, - uint32_t num2, - float* dst, - uint32_t ld0, - uint32_t ld1, - const float* src, - uint32_t stride0, - uint32_t stride1, - uint32_t stride2); - -// -template -__global__ void kern_axpy(int num, T alpha, const T* x, T* y) -{ - uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); - if (tid >= num) return; - y[tid] += alpha * x[tid]; -} - -// -template -inline void _cuann_axpy(int num, T alpha, const T* x, T* y) -{ - uint32_t nThreads = 128; - uint32_t nBlocks = (num + nThreads - 1) / nThreads; - kern_axpy<<>>(num, alpha, x, y); -} - -template void _cuann_axpy(int num, float alpha, const float* x, float* y); -template void _cuann_axpy(int num, uint32_t alpha, const uint32_t* x, uint32_t* y); - -// -template -T** _cuann_multi_device_malloc(int numDevices, - size_t numArrayElements, - const char* arrayName, - bool useCudaMalloc = false // If true, cudaMalloc() used, - // otherwise, cudaMallocManaged() used. 
-) -{ - int orgDevId; - RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); - T** arrays = (T**)malloc(sizeof(T*) * numDevices); - for (int devId = 0; devId < numDevices; devId++) { - RAFT_CUDA_TRY(cudaSetDevice(devId)); - if (useCudaMalloc) { - RAFT_CUDA_TRY(cudaMalloc(&(arrays[devId]), sizeof(T) * numArrayElements)); - } else { - RAFT_CUDA_TRY(cudaMallocManaged(&(arrays[devId]), sizeof(T) * numArrayElements)); - } - } - RAFT_CUDA_TRY(cudaSetDevice(orgDevId)); - return arrays; -} - -// multi_device_free -template -inline void _cuann_multi_device_free(T** arrays, int numDevices) +template +inline auto extend(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index { - for (int devId = 0; devId < numDevices; devId++) { - RAFT_CUDA_TRY(cudaFree(arrays[devId])); - } - free(arrays); + return raft::spatial::knn::ivf_pq::detail::extend( + handle, orig_index, new_vectors, new_indices, n_rows); } -template void _cuann_multi_device_free(float** arrays, int numDevices); -template void _cuann_multi_device_free(uint32_t** arrays, int numDevices); -template void _cuann_multi_device_free(uint8_t** arrays, int numDevices); - /** - * End of utils - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * start of kmeans + * @brief Extend the index with the new data. + * * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + * @param handle + * @param[inout] index + * @param[in] new_vectors a managed pointer to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices a managed pointer to a vector of indices [n_rows]. + * If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr` + * here to imply a continuous range `[0...n_rows)`. 
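+ *
+ * A minimal illustrative call (the in-place counterpart of the `extend` overload above,
+ * with `new_vectors`, `new_indices` and `n_rows` as described in these parameters):
+ * @code{.cpp}
+ *   ivf_pq::extend(handle, &index, new_vectors, new_indices, n_rows);
+ * @endcode
+ *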
+ * @param n_rows the number of samples */ - -// update kmeans centers -inline void _cuann_kmeans_update_centers(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - distance::DistanceType metric, - uint32_t* clusterSize, // [numCenters] - float* accumulatedCenters) -{ - auto stream = rmm::cuda_stream_default; - if (accumulatedCenters == NULL) { - // accumulate - detail::utils::memzero(centers, numCenters * dimCenters, stream); - detail::utils::memzero(clusterSize, numCenters, stream); - if (dtype == CUDA_R_32F) { - detail::utils::accumulate_into_selected( - numDataset, dimCenters, centers, clusterSize, (const float*)dataset, labels, stream); - } else if (dtype == CUDA_R_8U) { - detail::utils::accumulate_into_selected( - numDataset, dimCenters, centers, clusterSize, (const uint8_t*)dataset, labels, stream); - } else if (dtype == CUDA_R_8I) { - detail::utils::accumulate_into_selected( - numDataset, dimCenters, centers, clusterSize, (const int8_t*)dataset, labels, stream); - } - } else { - RAFT_CUDA_TRY(cudaMemcpy( - centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault)); - } - - if (metric == distance::DistanceType::InnerProduct) { - // normalize - _cuann_normalize(numCenters, dimCenters, centers, clusterSize); - } else { - // average - _cuann_divide(numCenters, dimCenters, centers, clusterSize); - } -} - -// -uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset) -{ - uint32_t chunk = (1 << 20); - if (chunk > (1 << 28) / numCenters) { - chunk = (1 << 28) / numCenters; - if (chunk > 31) { - chunk += 32; - chunk -= chunk % 64; - } else { - chunk = 64; - } - } - chunk = min(chunk, numDataset); - return chunk; -} - -// -inline size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters, - uint32_t dimCenters, - uint32_t numDataset) -{ - uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); - size_t size = 0; - // float *curDataset; // [chunk, dimCenters] - size += Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters); - // void *bufDataset; // [chunk, dimCenters] - size += Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters); - // float *workspace; - size += Pow2<128>::roundUp(sizeof(float) * (numCenters + chunk + (numCenters * chunk))); - return size; -} - -// predict label of dataset -inline void _cuann_kmeans_predict(const handle_t& handle, - float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - distance::DistanceType metric, - bool isCenterSet, - void* _workspace, - float* tempCenters, // [numCenters, dimCenters] - uint32_t* clusterSize, // [numCenters,] - bool updateCenter) -{ - if (!isCenterSet) { - // If centers are not set, the labels will be determined randomly. 
- for (uint32_t i = 0; i < numDataset; i++) { - labels[i] = i % numCenters; - } - if (tempCenters != NULL && clusterSize != NULL) { - // update centers - _cuann_kmeans_update_centers(centers, - numCenters, - dimCenters, - dataset, - dtype, - numDataset, - labels, - metric, - clusterSize, - nullptr); - } - return; - } - - uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset); - void* workspace = _workspace; - if (_workspace == NULL) { - size_t sizeWorkspace = _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset); - RAFT_CUDA_TRY(cudaMallocManaged(&workspace, sizeWorkspace)); - } - float* curDataset; // [chunk, dimCenters] - void* bufDataset; // [chunk, dimCenters] - // float* workspace_core; - curDataset = (float*)workspace; - bufDataset = - (void*)((uint8_t*)curDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters)); - // workspace_core = - // (float*)((uint8_t*)bufDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters)); - - auto stream = handle.get_stream(); - if (tempCenters != NULL && clusterSize != NULL) { - detail::utils::memzero(tempCenters, numCenters * dimCenters, stream); - detail::utils::memzero(clusterSize, numCenters, stream); - } - - cudaMemcpyKind kind; - cudaPointerAttributes attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); - if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { - kind = cudaMemcpyDeviceToDevice; - } else { - kind = cudaMemcpyHostToDevice; - } - - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, numCenters * chunk); - if (pool_guard) { - RAFT_LOG_DEBUG("_cuann_kmeans_predict: using pool memory resource with initial size %zu bytes", - pool_guard->pool_size()); - } - - for (uint64_t is = 0; is < numDataset; is += chunk) { - uint64_t ie = min(is + chunk, (uint64_t)numDataset); - uint32_t nDataset = ie - is; - - if (dtype == CUDA_R_32F) { - RAFT_CUDA_TRY(cudaMemcpy(bufDataset, - (float*)dataset + (is * dimCenters), - sizeof(float) * nDataset * dimCenters, - kind)); - } else if (dtype == CUDA_R_8U) { - RAFT_CUDA_TRY(cudaMemcpy(bufDataset, - (uint8_t*)dataset + (is * dimCenters), - sizeof(uint8_t) * nDataset * dimCenters, - kind)); - } else if (dtype == CUDA_R_8I) { - RAFT_CUDA_TRY(cudaMemcpy(bufDataset, - (int8_t*)dataset + (is * dimCenters), - sizeof(int8_t) * nDataset * dimCenters, - kind)); - } - - if (dtype == CUDA_R_32F) { -#if 0 - _cuann_copy(nDataset, dimCenters, - (const float*)bufDataset, dimCenters, - curDataset, dimCenters); -#else - // No need to copy when dtype is CUDA_R_32F - curDataset = (float*)bufDataset; -#endif - } else if (dtype == CUDA_R_8U) { - float divisor = 256.0; - _cuann_copy(nDataset, - dimCenters, - (const uint8_t*)bufDataset, - dimCenters, - curDataset, - dimCenters, - divisor); - } else if (dtype == CUDA_R_8I) { - float divisor = 128.0; - _cuann_copy(nDataset, - dimCenters, - (const int8_t*)bufDataset, - dimCenters, - curDataset, - dimCenters, - divisor); - } - - // predict - stream.synchronize(); - detail::kmeans::predict_float_core(handle, - centers, - numCenters, - dimCenters, - curDataset, - nDataset, - labels + is, - metric, - stream, - device_memory); - stream.synchronize(); - - if ((tempCenters != NULL) && (clusterSize != NULL)) { - // accumulate - detail::utils::accumulate_into_selected( - nDataset, dimCenters, tempCenters, clusterSize, curDataset, labels + is, stream); - } - } - - if ((tempCenters != NULL) && (clusterSize != NULL) && updateCenter) { - 
_cuann_kmeans_update_centers(centers, - numCenters, - dimCenters, - dataset, - dtype, - numDataset, - labels, - metric, - clusterSize, - tempCenters); - } - - if (_workspace == NULL) { RAFT_CUDA_TRY(cudaFree(workspace)); } -} - -// -// predict label of dataset with multiple devices -// -inline void _cuann_kmeans_predict_MP(const handle_t& handle, - float* clusterCenters, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - distance::DistanceType metric, - bool isCenterSet, - uint32_t* clusterSize, // [numCenters] - bool updateCenter // If true, cluster Centers will be updated. -) -{ - int numDevices = 1; - // [numDevices][numCenters, dimCenters] - float** clusterCentersCopy = _cuann_multi_device_malloc( - numDevices, numCenters * dimCenters, "clusterCentersCopy", true /* use cudaMalloc() */); - - // [numDevices][numCenters, dimCenters] - float** clusterCentersMP = - _cuann_multi_device_malloc(numDevices, numCenters * dimCenters, "clusterCentersMP"); - - // [numDevices][numCenters] - uint32_t** clusterSizeMP = - _cuann_multi_device_malloc(numDevices, numCenters, "clusterSizeMP"); - - // [numDevices][...] - size_t sizePredictWorkspace = - _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset); - void** predictWorkspaceMP = (void**)_cuann_multi_device_malloc( - numDevices, sizePredictWorkspace, "predictWorkspaceMP"); - - int orgDevId; - RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); -#pragma omp parallel num_threads(numDevices) - { - int devId = omp_get_thread_num(); - RAFT_CUDA_TRY(cudaSetDevice(devId)); - RAFT_CUDA_TRY(cudaMemcpy(clusterCentersCopy[devId], - clusterCenters, - sizeof(float) * numCenters * dimCenters, - cudaMemcpyDefault)); - uint64_t d0 = (uint64_t)numDataset * (devId) / numDevices; - uint64_t d1 = (uint64_t)numDataset * (devId + 1) / numDevices; - uint64_t nDataset = d1 - d0; - void* ptrDataset; - if (dtype == CUDA_R_32F) { - ptrDataset = (void*)((float*)dataset + (uint64_t)dimCenters * d0); - } else if (dtype == CUDA_R_8U) { - ptrDataset = (void*)((uint8_t*)dataset + (uint64_t)dimCenters * d0); - } else if (dtype == CUDA_R_8I) { - ptrDataset = (void*)((int8_t*)dataset + (uint64_t)dimCenters * d0); - } - _cuann_kmeans_predict(handle, - clusterCentersCopy[devId], - numCenters, - dimCenters, - ptrDataset, - dtype, - nDataset, - labels + d0, - metric, - isCenterSet, - predictWorkspaceMP[devId], - clusterCentersMP[devId], - clusterSizeMP[devId], - false /* do not update centers */); - } - for (int devId = 0; devId < numDevices; devId++) { - // Barrier - RAFT_CUDA_TRY(cudaSetDevice(devId)); - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - } - RAFT_CUDA_TRY(cudaSetDevice(orgDevId)); - auto stream = handle.get_stream(); - if (clusterSize != NULL) { - // Reduce results to main thread - detail::utils::memzero(clusterSize, numCenters, stream); - handle.sync_stream(stream); - for (int devId = 0; devId < numDevices; devId++) { - _cuann_axpy(numCenters, 1, clusterSizeMP[devId], clusterSize); - if (devId != orgDevId) { - _cuann_axpy( - numCenters * dimCenters, 1, clusterCentersMP[devId], clusterCentersMP[orgDevId]); - } - } - if (updateCenter) { - _cuann_kmeans_update_centers(clusterCenters, - numCenters, - dimCenters, - dataset, - dtype, - numDataset, - labels, - metric, - clusterSize, - clusterCentersMP[orgDevId]); - } - } - - _cuann_multi_device_free(clusterCentersCopy, numDevices); - _cuann_multi_device_free(clusterCentersMP, 
numDevices); - _cuann_multi_device_free(clusterSizeMP, numDevices); - _cuann_multi_device_free((uint8_t**)predictWorkspaceMP, numDevices); -} - -// predict labe of dataset (naive CPU version). -// (*) available only for prediction, but not for training. -inline void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - distance::DistanceType metric) -{ - float multiplier = 1.0; - if (dtype == CUDA_R_8U) { - multiplier = 1.0 / 256.0; - } else if (dtype == CUDA_R_8I) { - multiplier = 1.0 / 128.0; - } - for (uint32_t i = 0; i < numDataset; i++) { - float* vector = (float*)malloc(sizeof(float) * dimCenters); - for (uint32_t j = 0; j < dimCenters; j++) { - if (dtype == CUDA_R_32F) { - vector[j] = ((float*)dataset)[j + (dimCenters * i)]; - } else if (dtype == CUDA_R_8U) { - vector[j] = ((uint8_t*)dataset)[j + (dimCenters * i)]; - vector[j] *= multiplier; - } else if (dtype == CUDA_R_8I) { - vector[j] = ((int8_t*)dataset)[j + (dimCenters * i)]; - vector[j] *= multiplier; - } - } - float best_score; - for (uint32_t l = 0; l < numCenters; l++) { - float score = 0.0; - for (uint32_t j = 0; j < dimCenters; j++) { - if (metric == distance::DistanceType::InnerProduct) { - score -= vector[j] * centers[j + (dimCenters * l)]; - } else { - float diff = vector[j] - centers[j + (dimCenters * l)]; - score += diff * diff; - } - } - if ((l == 0) || (score < best_score)) { - labels[i] = l; - best_score = score; - } - } - free(vector); - } -} - -#define R_FACTOR 8 - -// -template -__global__ void kern_adjust_centers(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* _dataset, // [numDataet, dimCenters] - uint32_t numDataset, - const uint32_t* labels, // [numDataset] - distance::DistanceType metric, - const uint32_t* clusterSize, // [numCenters] - float threshold, - uint32_t average, - uint32_t ofst, - uint32_t* count) +template +inline void extend(const handle_t& handle, + index* index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) { - const T* dataset = (const T*)_dataset; - float divisor = (float)_divisor; - uint32_t l = threadIdx.y + blockDim.y * blockIdx.y; - if (l >= numCenters) return; - if (clusterSize[l] > (int)(average * threshold)) return; - - uint32_t laneId = threadIdx.x % 32; - uint32_t i; - if (laneId == 0) { - do { - uint32_t old = atomicAdd(count, 1); - i = (ofst * (old + 1)) % numDataset; - } while (clusterSize[labels[i]] < average); - } - i = __shfl_sync(0xffffffff, i, 0); - uint32_t li = labels[i]; - float sqsum = 0.0; - for (uint32_t j = laneId; j < dimCenters; j += 32) { - float val = centers[j + (uint64_t)dimCenters * li] * (R_FACTOR - 1); - val += (float)(dataset[j + (uint64_t)dimCenters * i]) / divisor; - val /= R_FACTOR; - sqsum += val * val; - centers[j + (uint64_t)dimCenters * l] = val; - } - if (metric == distance::DistanceType::InnerProduct) { - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); - sqsum = sqrt(sqsum); - for (uint32_t j = laneId; j < dimCenters; j += 32) { - centers[j + ((uint64_t)dimCenters * l)] /= sqsum; - } - } + *index = extend(handle, *index, new_vectors, new_indices, n_rows); } /** - * end of kmeans - 
* - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * Start of topk + * @brief Search ANN using the constructed index. + * + * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`: + * @code{.cpp} + * ... + * // Create a pooling memory resource with a pre-defined initial size. + * rmm::mr::pool_memory_resource mr( + * rmm::mr::get_current_device_resource(), 1024 * 1024); + * // use default search parameters + * ivf_pq::search_params search_params; + * // Use the same allocator across multiple searches to reduce the number of + * // cuda memory allocations + * ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr); + * ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr); + * ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr); + * ... + * @endcode + * The exact size of the temporary buffer depends on multiple factors and is an implementation + * detail. However, you can safely specify a small initial size for the memory pool, so that only a + * few allocations happen to grow it during the first invocations of the `search`. + * + * @tparam T data element type + * @tparam IdxT type of the indices + * + * @param handle + * @param params configure the search + * @param index ivf-pq constructed index + * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()] + * @param n_queries the batch size + * @param k the number of neighbors to find for each query. + * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + * @param mr an optional memory resource to use across the searches (you can provide a large enough + * memory pool here to avoid memory allocations within search). 
*/ - -// -#define NUM_THREADS 1024 // DO NOT CHANGE -#define STATE_BIT_LENGTH 8 // 0: state not used, 8: state used -#define MAX_VEC_LENGTH 8 // 1, 2, 4 or 8 - -// -__device__ inline uint32_t convert(uint32_t x) -{ - if (x & 0x80000000) { - return x ^ 0xffffffff; - } else { - return x ^ 0x80000000; - } -} - -// -struct u32_vector { - uint1 x1; - uint2 x2; - uint4 x4; - ulonglong4 x8; -}; - -// -template -__device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i) -{ - if (vecLen == 1) { - vec.x1 = ((uint1*)(x + i))[0]; - } else if (vecLen == 2) { - vec.x2 = ((uint2*)(x + i))[0]; - } else if (vecLen == 4) { - vec.x4 = ((uint4*)(x + i))[0]; - } else if (vecLen == 8) { - vec.x8 = ((ulonglong4*)(x + i))[0]; - } -} - -// -template -__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i) -{ - uint32_t xi; - if (vecLen == 1) { - xi = convert(vec.x1.x); - } else if (vecLen == 2) { - if (i == 0) - xi = convert(vec.x2.x); - else - xi = convert(vec.x2.y); - } else if (vecLen == 4) { - if (i == 0) - xi = convert(vec.x4.x); - else if (i == 1) - xi = convert(vec.x4.y); - else if (i == 2) - xi = convert(vec.x4.z); - else - xi = convert(vec.x4.w); - } else if (vecLen == 8) { - if (i == 0) - xi = convert((uint32_t)(vec.x8.x & 0xffffffff)); - else if (i == 1) - xi = convert((uint32_t)(vec.x8.x >> 32)); - else if (i == 2) - xi = convert((uint32_t)(vec.x8.y & 0xffffffff)); - else if (i == 3) - xi = convert((uint32_t)(vec.x8.y >> 32)); - else if (i == 4) - xi = convert((uint32_t)(vec.x8.z & 0xffffffff)); - else if (i == 5) - xi = convert((uint32_t)(vec.x8.z >> 32)); - else if (i == 6) - xi = convert((uint32_t)(vec.x8.w & 0xffffffff)); - else - xi = convert((uint32_t)(vec.x8.w >> 32)); - } - return xi; -} - -// -template -__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ - void kern_topk_cg_11(uint32_t topk, - uint32_t size_batch, - uint32_t max_len_x, - uint32_t* len_x, // [size_batch,] - const uint32_t* _x, // [size_batch, max_len_x,] - uint8_t* _state, // [size_batch, max_len_x / 8,] - uint32_t* _labels, // [size_batch, topk,] - uint32_t* _count // [size_batch, 5 * 1024,] - ) -{ - __shared__ uint32_t smem[2048 + 6]; - uint32_t* best_index = &(smem[2048]); - uint32_t* best_csum = &(smem[2048 + 3]); - typedef BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp_storage; - namespace cg = cooperative_groups; - cg::grid_group grid = cg::this_grid(); - uint32_t i_batch = blockIdx.y; - if (i_batch >= size_batch) return; - - uint32_t nx; - if (len_x == NULL) { - nx = max_len_x; - } else { - nx = len_x[i_batch]; - } - - uint32_t num_threads = blockDim_x * gridDim.x; - uint32_t thread_id = threadIdx.x + (blockDim_x * blockIdx.x); - - const uint32_t* x = _x + (max_len_x * i_batch); - uint8_t* state = NULL; - if (stateBitLen == 8) { - uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; - uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; - state = _state + (numState_perThread * num_threads * i_batch); - } - uint32_t* labels = _labels + (topk * i_batch); - if (threadIdx.x < 6) { smem[2048 + threadIdx.x] = 0; } - - uint32_t* count = _count + (5 * 1024 * i_batch); - for (int i = thread_id; i < 5 * 1024; i += num_threads) { - count[i] = 0; - } - cg::sync(grid); - - uint32_t count_below = 0; - uint32_t threshold = 0; - - // - // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
- // - for (int j = 0; j < 2; j += 1) { - uint32_t shift = (21 - 11 * j); - for (int i = threadIdx.x; i < 2048; i += blockDim_x) { - smem[i] = 0; - } - __syncthreads(); - - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u32_vector x_vec; - load_u32_vector(x_vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - uint32_t xi = get_element_from_u32_vector(x_vec, u); - if (xi < threshold) { - if (stateBitLen == 8) { - labels[atomicAdd(&count[0], 1)] = ivu; - iState |= mask; - } - } else { - uint32_t k = (xi - threshold) >> shift; // 0 <= k - if (k >= 2048) { - if (stateBitLen == 8) { iState |= mask; } - } else if (k + 1 < 2048) { - atomicAdd(&(smem[k + 1]), 1); - } - } - } - } - if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } - } - __syncthreads(); - - for (int i = threadIdx.x; i < 2048; i += blockDim_x) { - if (smem[i] > 0) { atomicAdd(&(count[i + (2048 * j)]), smem[i]); } - } - cg::sync(grid); - - constexpr int n_data = 2048 / blockDim_x; - uint32_t csum[n_data]; -#pragma unroll - for (int i = 0; i < n_data; i++) { - csum[i] = count[i + (n_data * threadIdx.x) + (2048 * j)]; - } - BlockScanT(temp_storage).InclusiveSum(csum, csum); - -#pragma unroll - for (int i = n_data - 1; i >= 0; i--) { - if (count_below + csum[i] >= topk) continue; - uint32_t index = i + (n_data * threadIdx.x); - atomicMax(&(best_index[j]), index); - atomicMax(&(best_csum[j]), csum[i]); - break; - } - __syncthreads(); - - count_below += best_csum[j]; - threshold += (best_index[j] << shift); - } - - { - uint32_t j = 2; - for (int i = threadIdx.x; i < 1024; i += blockDim_x) { - smem[i] = 0; - } - __syncthreads(); - - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8) { - iState = state[thread_id + (num_threads * ii)]; - if (iState == (uint8_t)0xff) continue; - } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u32_vector x_vec; - load_u32_vector(x_vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - if ((stateBitLen == 8) && (iState & mask)) continue; - uint32_t xi = get_element_from_u32_vector(x_vec, u); - if (xi < threshold) { - if (stateBitLen == 8) { - labels[atomicAdd(&count[0], 1)] = ivu; - iState |= mask; - } - } else { - uint32_t k = (xi - threshold); // 0 <= k - if (k >= 1024) { - if (stateBitLen == 8) { iState |= mask; } - } else if (k + 1 < 1024) { - atomicAdd(&(smem[k + 1]), 1); - } - } - } - } - if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } - } - __syncthreads(); - - for (int i = threadIdx.x; i < 1024; i += blockDim_x) { - if (smem[i] > 0) { atomicAdd(&(count[i + (2048 * j)]), smem[i]); } - } - cg::sync(grid); - - constexpr int n_data = 1024 / blockDim_x; - uint32_t csum[n_data]; -#pragma unroll - for (int i = 0; i < n_data; i++) { - csum[i] = count[i + (n_data * threadIdx.x) + (2048 * j)]; - } - BlockScanT(temp_storage).InclusiveSum(csum, csum); - 
-#pragma unroll - for (int i = n_data - 1; i >= 0; i--) { - if (count_below + csum[i] >= topk) continue; - uint32_t index = i + (n_data * threadIdx.x); - atomicMax(&(best_index[j]), index); - atomicMax(&(best_csum[j]), csum[i]); - break; - } - __syncthreads(); - - count_below += best_csum[j]; - threshold += best_index[j]; - } - - // - // Get labels that satifies "x[i] < threshold". - // - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8) { - iState = state[thread_id + (num_threads * ii)]; - if (iState == (uint8_t)0xff) continue; - } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u32_vector vec; - load_u32_vector(vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - if ((stateBitLen == 8) && (iState & mask)) continue; - uint32_t xi = get_element_from_u32_vector(vec, u); - if (xi < threshold) { - labels[atomicAdd(&count[0], 1)] = ivu; - } else if ((xi == threshold) && (count_below + count[2048] < topk)) { - if (count_below + atomicAdd(&count[2048], 1) < topk) { - labels[atomicAdd(&count[0], 1)] = ivu; - } - } - } - } - } -} - -// -template -__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ - void kern_topk_cta_11(uint32_t topk, - uint32_t size_batch, - uint32_t max_len_x, - uint32_t* len_x, // [size_batch, max_len_x,] - const uint32_t* _x, // [size_batch, max_len_x,] - uint8_t* _state, // [size_batch, max_len_x / 8,] - uint32_t* _labels // [size_batch, topk,] - ) -{ - __shared__ uint32_t smem[2048 + 3 + 3 + 2]; - uint32_t* best_index = &(smem[2048]); - uint32_t* best_csum = &(smem[2048 + 3]); - uint32_t* count = &(smem[2048 + 6]); - typedef BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp_storage; - uint32_t i_batch = blockIdx.y; - if (i_batch >= size_batch) return; - - uint32_t nx; - if (len_x == NULL) { - nx = max_len_x; - } else { - nx = len_x[i_batch]; - } - - uint32_t num_threads = blockDim_x; - uint32_t thread_id = threadIdx.x; - - const uint32_t* x = _x + (max_len_x * i_batch); - uint8_t* state = NULL; - if (stateBitLen == 8) { - uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; - uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; - state = _state + (numState_perThread * num_threads * i_batch); - } - uint32_t* labels = _labels + (topk * i_batch); - if (threadIdx.x < 8) { smem[2048 + threadIdx.x] = 0; } - - uint32_t count_below = 0; - uint32_t threshold = 0; - - // - // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
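// Host-side sketch of the per-thread state bookkeeping used throughout these
// kernels (names are illustrative): each thread processes its candidates in groups
// of stateBitLen (= 8) elements and keeps one byte per group, where bit u means
// "element u is already resolved" - either it was emitted as being below the
// threshold, or it is known to fall above every bin of the current pass. Later
// passes skip resolved elements and skip a whole group once its byte is 0xff.
#include <cstdint>
#include <vector>

inline void filter_pass_sketch(const std::vector<uint32_t>& keys,
                               std::vector<uint8_t>& state,  // one byte per 8 keys
                               uint32_t threshold)
{
  for (size_t g = 0; g < state.size(); g++) {
    uint8_t s = state[g];
    if (s == 0xff) continue;  // every element of this group is already resolved
    for (int u = 0; u < 8; u++) {
      if (s & (uint8_t(1) << u)) continue;  // resolved in an earlier pass
      uint32_t x = keys[8 * g + u];
      if (x < threshold) { s |= uint8_t(1) << u; /* the kernels emit the label here */ }
    }
    state[g] = s;
  }
}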
- // - for (int j = 0; j < 2; j += 1) { - uint32_t shift = (21 - 11 * j); - for (int i = threadIdx.x; i < 2048; i += blockDim_x) { - smem[i] = 0; - } - __syncthreads(); - - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u32_vector x_vec; - load_u32_vector(x_vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - uint32_t xi = get_element_from_u32_vector(x_vec, u); - if (xi < threshold) { - if (stateBitLen == 8) { - labels[atomicAdd(&count[0], 1)] = ivu; - iState |= mask; - } - } else { - uint32_t k = (xi - threshold) >> shift; // 0 <= k - if (k >= 2048) { - if (stateBitLen == 8) { iState |= mask; } - } else if (k + 1 < 2048) { - atomicAdd(&(smem[k + 1]), 1); - } - } - } - } - if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } - } - __syncthreads(); - - constexpr int n_data = 2048 / blockDim_x; - uint32_t csum[n_data]; -#pragma unroll - for (int i = 0; i < n_data; i++) { - csum[i] = smem[i + (n_data * threadIdx.x)]; - } - BlockScanT(temp_storage).InclusiveSum(csum, csum); - -#pragma unroll - for (int i = n_data - 1; i >= 0; i--) { - if (count_below + csum[i] > topk) continue; - uint32_t index = i + (n_data * threadIdx.x); - atomicMax(&(best_index[j]), index); - atomicMax(&(best_csum[j]), csum[i]); - break; - } - __syncthreads(); - - count_below += best_csum[j]; - threshold += (best_index[j] << shift); - if (count_below == topk) break; - } - - { - uint32_t j = 2; - for (int i = threadIdx.x; i < 1024; i += blockDim_x) { - smem[i] = 0; - } - __syncthreads(); - - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8) { - iState = state[thread_id + (num_threads * ii)]; - if (iState == (uint8_t)0xff) continue; - } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u32_vector x_vec; - load_u32_vector(x_vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - if ((stateBitLen == 8) && (iState & mask)) continue; - uint32_t xi = get_element_from_u32_vector(x_vec, u); - if (xi < threshold) { - if (stateBitLen == 8) { - labels[atomicAdd(&count[0], 1)] = ivu; - iState |= mask; - } - } else { - uint32_t k = (xi - threshold); // 0 <= k - if (k >= 1024) { - if (stateBitLen == 8) { iState |= mask; } - } else if (k + 1 < 1024) { - atomicAdd(&(smem[k + 1]), 1); - } - } - } - } - if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } - } - __syncthreads(); - - constexpr int n_data = 1024 / blockDim_x; - uint32_t csum[n_data]; -#pragma unroll - for (int i = 0; i < n_data; i++) { - csum[i] = smem[i + (n_data * threadIdx.x)]; - } - BlockScanT(temp_storage).InclusiveSum(csum, csum); - -#pragma unroll - for (int i = n_data - 1; i >= 0; i--) { - if (count_below + csum[i] > topk) continue; - uint32_t index = i + (n_data * threadIdx.x); - atomicMax(&(best_index[j]), index); - atomicMax(&(best_csum[j]), csum[i]); - break; - } - __syncthreads(); - - count_below += best_csum[j]; - 
threshold += best_index[j]; - } - - // - // Get labels that satifies "x[i] < threshold". - // - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8) { - iState = state[thread_id + (num_threads * ii)]; - if (iState == (uint8_t)0xff) continue; - } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u32_vector vec; - load_u32_vector(vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - if ((stateBitLen == 8) && (iState & mask)) continue; - uint32_t xi = get_element_from_u32_vector(vec, u); - if (xi < threshold) { - labels[atomicAdd(&count[0], 1)] = ivu; - } else if ((xi == threshold) && (count_below + count[1] < topk)) { - if (count_below + atomicAdd(&count[1], 1) < topk) { - labels[atomicAdd(&count[0], 1)] = ivu; - } - } - } - } - } -} - -// -__device__ inline uint16_t convert(uint16_t x) -{ - if (x & 0x8000) { - return x ^ 0xffff; - } else { - return x ^ 0x8000; - } -} - -// -struct u16_vector { - ushort1 x1; - ushort2 x2; - ushort4 x4; - uint4 x8; -}; - -// -template -__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i) -{ - if (vecLen == 1) { - vec.x1 = ((ushort1*)(x + i))[0]; - } else if (vecLen == 2) { - vec.x2 = ((ushort2*)(x + i))[0]; - } else if (vecLen == 4) { - vec.x4 = ((ushort4*)(x + i))[0]; - } else if (vecLen == 8) { - vec.x8 = ((uint4*)(x + i))[0]; - } -} - -// -template -__device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i) -{ - uint16_t xi; - if (vecLen == 1) { - xi = convert(vec.x1.x); - } else if (vecLen == 2) { - if (i == 0) - xi = convert(vec.x2.x); - else - xi = convert(vec.x2.y); - } else if (vecLen == 4) { - if (i == 0) - xi = convert(vec.x4.x); - else if (i == 1) - xi = convert(vec.x4.y); - else if (i == 2) - xi = convert(vec.x4.z); - else - xi = convert(vec.x4.w); - } else if (vecLen == 8) { - if (i == 0) - xi = convert((uint16_t)(vec.x8.x & 0xffff)); - else if (i == 1) - xi = convert((uint16_t)(vec.x8.x >> 16)); - else if (i == 2) - xi = convert((uint16_t)(vec.x8.y & 0xffff)); - else if (i == 3) - xi = convert((uint16_t)(vec.x8.y >> 16)); - else if (i == 4) - xi = convert((uint16_t)(vec.x8.z & 0xffff)); - else if (i == 5) - xi = convert((uint16_t)(vec.x8.z >> 16)); - else if (i == 6) - xi = convert((uint16_t)(vec.x8.w & 0xffff)); - else - xi = convert((uint16_t)(vec.x8.w >> 16)); - } - return xi; -} - -// -template -__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ - void kern_topk_cg_8(uint32_t topk, - uint32_t size_batch, - uint32_t max_len_x, - uint32_t* len_x, // [size_batch,] - const uint16_t* _x, // [size_batch, max_len_x,] - uint8_t* _state, // [size_batch, max_len_x / 8,] - uint32_t* _labels, // [size_batch, topk,] - uint32_t* _count // [size_batch, 5 * 1024,] - ) -{ - __shared__ uint32_t smem[256 + 4]; - uint32_t* best_index = &(smem[256]); - uint32_t* best_csum = &(smem[256 + 2]); - typedef BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp_storage; - namespace cg = cooperative_groups; - cg::grid_group grid = cg::this_grid(); - uint32_t i_batch = blockIdx.y; - if (i_batch >= size_batch) return; - - uint32_t nx; - if (len_x == NULL) { - nx = max_len_x; - } else { - nx = len_x[i_batch]; - } - - uint32_t num_threads = blockDim_x * gridDim.x; - uint32_t thread_id = 
threadIdx.x + (blockDim_x * blockIdx.x); - - const uint16_t* x = _x + (max_len_x * i_batch); - uint8_t* state = NULL; - if (stateBitLen == 8) { - uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; - uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; - state = _state + (numState_perThread * num_threads * i_batch); - } - uint32_t* labels = _labels + (topk * i_batch); - if (threadIdx.x < 4) { smem[256 + threadIdx.x] = 0; } - - uint32_t* count = _count + (2 * 256 * i_batch); - for (int i = thread_id; i < 2 * 256; i += num_threads) { - count[i] = 0; - } - cg::sync(grid); - - uint32_t count_below = 0; - uint32_t threshold = 0; - - // - // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". - // - for (int j = 0; j < 2; j += 1) { - uint32_t shift = (8 - 8 * j); - for (int i = threadIdx.x; i < 256; i += blockDim_x) { - smem[i] = 0; - } - __syncthreads(); - - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u16_vector x_vec; - load_u16_vector(x_vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - uint32_t xi = get_element_from_u16_vector(x_vec, u); - if (xi < threshold) { - if (stateBitLen == 8) { - labels[atomicAdd(&count[0], 1)] = ivu; - iState |= mask; - } - } else { - uint32_t k = (xi - threshold) >> shift; // 0 <= k - if (k >= 256) { - if (stateBitLen == 8) { iState |= mask; } - } else if (k + 1 < 256) { - atomicAdd(&(smem[k + 1]), 1); - } - } - } - } - if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } - } - __syncthreads(); - - for (int i = threadIdx.x; i < 256; i += blockDim_x) { - if (smem[i] > 0) { atomicAdd(&(count[i + (256 * j)]), smem[i]); } - } - cg::sync(grid); - - uint32_t csum[1]; - csum[0] = 0; - if (threadIdx.x < 256) { csum[0] = count[threadIdx.x + (256 * j)]; } - BlockScanT(temp_storage).InclusiveSum(csum, csum); - - if (threadIdx.x < 256) { - if (count_below + csum[0] < topk) { - uint32_t index = threadIdx.x; - atomicMax(&(best_index[j]), index); - atomicMax(&(best_csum[j]), csum[0]); - } - } - __syncthreads(); - - count_below += best_csum[j]; - threshold += (best_index[j] << shift); - } - - // - // Get labels that satifies "x[i] < threshold". 
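// Host-side sketch of the tie handling in the collection loop that follows
// (illustrative names): everything strictly below the threshold is always a top-k
// element; elements exactly equal to the threshold only fill whatever slots remain,
// first come first served (the kernels arbitrate this with an atomic counter, so the
// order among equal keys is non-deterministic).
#include <cstdint>
#include <vector>

inline std::vector<uint32_t> collect_topk_sketch(const std::vector<uint32_t>& keys,
                                                 uint32_t threshold,
                                                 uint32_t count_below,  // #keys < threshold
                                                 uint32_t topk)
{
  std::vector<uint32_t> labels;
  uint32_t ties_taken = 0;
  for (size_t i = 0; i < keys.size(); i++) {
    if (keys[i] < threshold) {
      labels.push_back(static_cast<uint32_t>(i));
    } else if (keys[i] == threshold && count_below + ties_taken < topk) {
      ties_taken++;
      labels.push_back(static_cast<uint32_t>(i));
    }
  }
  return labels;  // exactly topk indices whenever enough keys tie at the threshold
}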
- // - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8) { - iState = state[thread_id + (num_threads * ii)]; - if (iState == (uint8_t)0xff) continue; - } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u16_vector vec; - load_u16_vector(vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - if ((stateBitLen == 8) && (iState & mask)) continue; - uint32_t xi = get_element_from_u16_vector(vec, u); - if (xi < threshold) { - labels[atomicAdd(&count[0], 1)] = ivu; - } else if ((xi == threshold) && (count_below + count[256] < topk)) { - if (count_below + atomicAdd(&count[256], 1) < topk) { - labels[atomicAdd(&count[0], 1)] = ivu; - } - } - } - } - } -} - -// -template -__launch_bounds__(NUM_THREADS, 1024 / NUM_THREADS) __global__ - void kern_topk_cta_8(uint32_t topk, - uint32_t size_batch, - uint32_t max_len_x, - uint32_t* len_x, // [size_batch, max_len_x,] - const uint16_t* _x, // [size_batch, max_len_x,] - uint8_t* _state, // [size_batch, max_len_x / 8,] - uint32_t* _labels // [size_batch, topk,] - ) -{ - __shared__ uint32_t smem[256 + 6]; - uint32_t* best_index = &(smem[256]); - uint32_t* best_csum = &(smem[256 + 2]); - uint32_t* count = &(smem[256 + 4]); - typedef BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp_storage; - uint32_t i_batch = blockIdx.y; - if (i_batch >= size_batch) return; - - uint32_t nx; - if (len_x == NULL) { - nx = max_len_x; - } else { - nx = len_x[i_batch]; - } - - uint32_t num_threads = blockDim_x; - uint32_t thread_id = threadIdx.x; - - const uint16_t* x = _x + (max_len_x * i_batch); - uint8_t* state = NULL; - if (stateBitLen == 8) { - uint32_t numSample_perThread = (max_len_x + num_threads - 1) / num_threads; - uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; - state = _state + (numState_perThread * num_threads * i_batch); - } - uint32_t* labels = _labels + (topk * i_batch); - if (threadIdx.x < 6) { smem[256 + threadIdx.x] = 0; } - - uint32_t count_below = 0; - uint32_t threshold = 0; - - // - // Search for the maximum threshold that satisfies "(x < threshold).sum() < topk". 
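// The 16-bit keys consumed by this kernel are half-precision scores reinterpreted
// as uint16_t and made order-preserving by convert() above. A device-side sketch of
// the complete mapping (the function name is illustrative): flipping only the sign
// bit keeps the order of non-negative values, while fully inverting negative values
// reverses their descending bit-pattern order, so plain unsigned comparison of the
// resulting keys matches the floating-point order for all finite values.
#include <cstdint>
#include <cuda_fp16.h>

__device__ inline uint16_t half_to_radix_key(half v)
{
  uint16_t b = __half_as_ushort(v);
  return (b & 0x8000) ? static_cast<uint16_t>(~b) : static_cast<uint16_t>(b | 0x8000);
}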
- // - for (int j = 0; j < 2; j += 1) { - uint32_t shift = (8 - 8 * j); - for (int i = threadIdx.x; i < 256; i += blockDim_x) { - smem[i] = 0; - } - __syncthreads(); - - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8 && j > 0) { iState = state[thread_id + (num_threads * ii)]; } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u16_vector x_vec; - load_u16_vector(x_vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - uint32_t xi = get_element_from_u16_vector(x_vec, u); - if (xi < threshold) { - if (stateBitLen == 8) { - labels[atomicAdd(&count[0], 1)] = ivu; - iState |= mask; - } - } else { - uint32_t k = (xi - threshold) >> shift; // 0 <= k - if (k >= 256) { - if (stateBitLen == 8) { iState |= mask; } - } else if (k + 1 < 256) { - atomicAdd(&(smem[k + 1]), 1); - } - } - } - } - if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; } - } - __syncthreads(); - - uint32_t csum[1]; - if (threadIdx.x < 256) { csum[0] = smem[threadIdx.x]; } - BlockScanT(temp_storage).InclusiveSum(csum, csum); - - if (threadIdx.x < 256) { - if (count_below + csum[0] < topk) { - uint32_t index = threadIdx.x; - atomicMax(&(best_index[j]), index); - atomicMax(&(best_csum[j]), csum[0]); - } - } - __syncthreads(); - - count_below += best_csum[j]; - threshold += (best_index[j] << shift); - if (count_below == topk) break; - } - - // - // Get labels that satifies "x[i] < threshold". - // - int ii = 0; - for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { - uint8_t iState = 0; - if (stateBitLen == 8) { - iState = state[thread_id + (num_threads * ii)]; - if (iState == (uint8_t)0xff) continue; - } -#pragma unroll - for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) { - int iv = i + (num_threads * v); - if (iv >= nx) break; - - struct u16_vector vec; - load_u16_vector(vec, x, iv); -#pragma unroll - for (int u = 0; u < vecLen; u++) { - int ivu = iv + u; - if (ivu >= nx) break; - - uint8_t mask = (uint8_t)0x1 << (v + u); - if ((stateBitLen == 8) && (iState & mask)) continue; - uint32_t xi = get_element_from_u16_vector(vec, u); - if (xi < threshold) { - labels[atomicAdd(&count[0], 1)] = ivu; - } else if ((xi == threshold) && (count_below + count[1] < topk)) { - if (count_below + atomicAdd(&count[1], 1) < topk) { - labels[atomicAdd(&count[0], 1)] = ivu; - } - } - } - } - } -} - -// -__global__ void _sort_topk_prep(uint32_t sizeBatch, - uint32_t topK, - uint32_t maxSamples, - const uint32_t* labels, // [sizeBatch, topK] - const float* samples, // [sizeBatch, maxSamples] - int* offsets, // [sizeBatch + 1] - float* outputs // [sizeBatch, topK] -) -{ - uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); - if (tid < sizeBatch + 1) { offsets[tid] = tid * topK; } - if (tid < sizeBatch * topK) { - uint32_t label = labels[tid]; - uint32_t iBatch = tid / topK; - float value = samples[label + (maxSamples * iBatch)]; - outputs[tid] = value; - } -} - -// -inline size_t _cuann_find_topk_bufferSize(const handle_t& handle, - uint32_t topK, - uint32_t sizeBatch, - uint32_t maxSamples, - cudaDataType_t sampleDtype = CUDA_R_32F) -{ - constexpr int numThreads = NUM_THREADS; - constexpr int stateBitLen = STATE_BIT_LENGTH; - static_assert(stateBitLen == 0 || stateBitLen == 8); - - 
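// The workspace sizing below relies on CUB's two-phase pattern: calling a Device*
// primitive with a null temp-storage pointer only writes the required byte count,
// and a second call with a real allocation does the work. A minimal self-contained
// sketch of that pattern (buffer names are illustrative):
#include <cstdint>
#include <cub/cub.cuh>

inline void radix_sort_pairs_sketch(float* d_keys_in, float* d_keys_out,
                                    uint32_t* d_vals_in, uint32_t* d_vals_out,
                                    int num_items, cudaStream_t stream)
{
  size_t temp_bytes = 0;
  // Phase 1: query the workspace size (no sorting happens).
  cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes,
                                  d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                                  num_items, 0, sizeof(float) * 8, stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  // Phase 2: run the sort with the allocated workspace.
  cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes,
                                  d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                                  num_items, 0, sizeof(float) * 8, stream);
  cudaFree(d_temp);
}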
size_t workspaceSize = 0; - // count - if (sampleDtype == CUDA_R_16F) { - workspaceSize += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 2 * 256); - } else { - workspaceSize += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 5 * 1024); - } - // state - if (stateBitLen == 8) { - // (*) Each thread has at least one array element for state - uint32_t numBlocks_perBatch = (getMultiProcessorCount() * 2 + sizeBatch) / sizeBatch; - - uint32_t numThreads_perBatch = numThreads * numBlocks_perBatch; - uint32_t numSample_perThread = (maxSamples + numThreads_perBatch - 1) / numThreads_perBatch; - uint32_t numState_perThread = (numSample_perThread + stateBitLen - 1) / stateBitLen; - workspaceSize += - Pow2<128>::roundUp(sizeof(uint8_t) * numState_perThread * numThreads_perBatch * sizeBatch); - } - - size_t workspaceSize2 = 0; - // offsets - workspaceSize2 += Pow2<128>::roundUp(sizeof(int) * (sizeBatch + 1)); - // keys_in, keys_out, values_out - workspaceSize2 += Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK); - workspaceSize2 += Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK); - workspaceSize2 += Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * topK); - // cub_ws - size_t cub_ws_size = 0; - cub::DeviceSegmentedRadixSort::SortPairs(NULL, - cub_ws_size, - (float*)NULL, - (float*)NULL, - (uint32_t*)NULL, - (uint32_t*)NULL, - sizeBatch * topK, - sizeBatch, - (int*)NULL, - (int*)NULL); - workspaceSize2 += Pow2<128>::roundUp(cub_ws_size); - workspaceSize = max(workspaceSize, workspaceSize2); - - return workspaceSize; -} - -// -int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH) -{ - int vecLen = min(maxVecLen, MAX_VEC_LENGTH); - while ((maxSamples % vecLen) != 0) { - vecLen /= 2; - } - return vecLen; -} - -// -inline void _cuann_find_topk(const handle_t& handle, - uint32_t topK, - uint32_t sizeBatch, - uint32_t maxSamples, - uint32_t* numSamples, // [sizeBatch,] - const float* samples, // [sizeBatch, maxSamples,] - uint32_t* labels, // [sizeBatch, topK,] - void* workspace, - bool sort = false) -{ - constexpr int numThreads = NUM_THREADS; - constexpr int stateBitLen = STATE_BIT_LENGTH; - static_assert(stateBitLen == 0 || stateBitLen == 8); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - RAFT_CUDA_TRY( - cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); -#endif - - // Limit the maximum value of vecLen to 4. In the case of FP32, - // setting vecLen = 8 in cg_kernel causes too much register usage. - int vecLen = _get_vecLen(maxSamples, 4); - void* cg_kernel; - if (vecLen == 4) { - cg_kernel = (void*)kern_topk_cg_11; - } else if (vecLen == 2) { - cg_kernel = (void*)kern_topk_cg_11; - } else if (vecLen == 1) { - cg_kernel = (void*)kern_topk_cg_11; - } - - int numBlocksPerSm_topk; - size_t dynamicSMemSize = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm_topk, cg_kernel, numThreads, dynamicSMemSize)); - int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); - int numBlocks = - min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); - numBlocks_perBatch = max(numBlocks / sizeBatch, 1); - if (maxSamples <= numThreads * 10) { - // When number of sample is small, using multiple thread-blocks does not - // improve performance, in which case cta_kernel is used. Tentatively, - // "numThreads * 10" is used as the threshold, but this may be better - // determined by auto-tuning, etc. 
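// When more than one block per batch is used, the kernel is started with
// cudaLaunchCooperativeKernel so that cg::this_grid().sync() is valid across blocks,
// which also requires the whole grid to be resident on the device at once. A minimal
// sketch of that launch pattern (kernel and argument names are illustrative):
#include <algorithm>
#include <cooperative_groups.h>
#include <cuda_runtime.h>

__global__ void coop_kernel_sketch(int* data, int n)
{
  namespace cg = cooperative_groups;
  cg::grid_group grid = cg::this_grid();
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) data[i] += 1;
  grid.sync();  // grid-wide barrier; only legal in a cooperative launch
  if (i < n) data[i] *= 2;
}

inline void launch_coop_sketch(int* d_data, int n, cudaStream_t stream)
{
  int dev = 0, num_sms = 0, blocks_per_sm = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev);
  const int threads = 256;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, coop_kernel_sketch, threads, 0);
  // Clamp the grid so that every block can be resident simultaneously.
  int blocks = std::min((n + threads - 1) / threads, num_sms * blocks_per_sm);
  void* args[] = {&d_data, &n};  // pointers to the kernel arguments, in order
  cudaLaunchCooperativeKernel((void*)coop_kernel_sketch, dim3(blocks), dim3(threads), args, 0, stream);
}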
- numBlocks_perBatch = 1; - } - uint32_t* count = (uint32_t*)workspace; - uint8_t* state = NULL; - if (stateBitLen == 8) { - state = (uint8_t*)count + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 5 * 1024); - } - - dim3 threads(numThreads, 1, 1); - dim3 blocks(numBlocks_perBatch, sizeBatch, 1); - if (numBlocks_perBatch <= 1) { - void (*cta_kernel)( - uint32_t, uint32_t, uint32_t, uint32_t*, const uint32_t*, uint8_t*, uint32_t*); - int vecLen = _get_vecLen(maxSamples); - if (vecLen == 8) { - cta_kernel = kern_topk_cta_11; - } else if (vecLen == 4) { - cta_kernel = kern_topk_cta_11; - } else if (vecLen == 2) { - cta_kernel = kern_topk_cta_11; - } else if (vecLen == 1) { - cta_kernel = kern_topk_cta_11; - } - cta_kernel<<>>( - topK, sizeBatch, maxSamples, numSamples, (const uint32_t*)samples, state, labels); - } else { - void* args[9]; - args[0] = {&(topK)}; - args[1] = {&(sizeBatch)}; - args[2] = {&(maxSamples)}; - args[3] = {&(numSamples)}; - args[4] = {&(samples)}; - args[5] = {&(state)}; - args[6] = {&(labels)}; - args[7] = {&(count)}; - args[8] = {nullptr}; - RAFT_CUDA_TRY( - cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream())); - } - if (!sort) { return; } - - // offsets: [sizeBatch + 1] - // keys_in, keys_out, values_out: [sizeBatch, topK] - int* offsets = (int*)workspace; - float* keys_in = (float*)((uint8_t*)offsets + Pow2<128>::roundUp(sizeof(int) * (sizeBatch + 1))); - float* keys_out = - (float*)((uint8_t*)keys_in + Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK)); - uint32_t* values_out = - (uint32_t*)((uint8_t*)keys_out + Pow2<128>::roundUp(sizeof(float) * sizeBatch * topK)); - void* cub_ws = - (void*)((uint8_t*)values_out + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * topK)); - - dim3 stpThreads(128, 1, 1); - dim3 stpBlocks((max(sizeBatch + 1, sizeBatch * topK) + stpThreads.x - 1) / stpThreads.x, 1, 1); - _sort_topk_prep<<>>( - sizeBatch, topK, maxSamples, labels, samples, offsets, keys_in); - - size_t cub_ws_size = 0; - cub::DeviceSegmentedRadixSort::SortPairs(NULL, - cub_ws_size, - keys_in, - keys_out, - labels, - values_out, - sizeBatch * topK, - sizeBatch, - offsets, - offsets + 1); - - cub::DeviceSegmentedRadixSort::SortPairs(cub_ws, - cub_ws_size, - keys_in, - keys_out, - labels, - values_out, - sizeBatch * topK, - sizeBatch, - offsets, - offsets + 1, - (int)0, - (int)(sizeof(float) * 8), - handle.get_stream()); - - RAFT_CUDA_TRY(cudaMemcpyAsync(labels, - values_out, - sizeof(uint32_t) * sizeBatch * topK, - cudaMemcpyDeviceToDevice, - handle.get_stream())); -} - -// -inline void _cuann_find_topk(const handle_t& handle, - uint32_t topK, - uint32_t sizeBatch, - uint32_t maxSamples, - uint32_t* numSamples, // [sizeBatch,] - const half* samples, // [sizeBatch, maxSamples,] - uint32_t* labels, // [sizeBatch, topK,] - void* workspace, - bool sort = false) -{ - constexpr int numThreads = NUM_THREADS; - constexpr int stateBitLen = STATE_BIT_LENGTH; - static_assert(stateBitLen == 0 || stateBitLen == 8); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - RAFT_CUDA_TRY( - cudaMemsetAsync(labels, 0xff, sizeof(uint32_t) * sizeBatch * topK, handle.get_stream())); -#endif - - int vecLen = _get_vecLen(maxSamples); - void* cg_kernel; - if (vecLen == 8) { - cg_kernel = (void*)kern_topk_cg_8; - } else if (vecLen == 4) { - cg_kernel = (void*)kern_topk_cg_8; - } else if (vecLen == 2) { - cg_kernel = (void*)kern_topk_cg_8; - } else if (vecLen == 1) { - cg_kernel = (void*)kern_topk_cg_8; - } - - int numBlocksPerSm_topk; - RAFT_CUDA_TRY( - 
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm_topk, cg_kernel, numThreads, 0)); - int numBlocks_perBatch = (maxSamples + (numThreads * vecLen) - 1) / (numThreads * vecLen); - int numBlocks = - min(numBlocks_perBatch * sizeBatch, getMultiProcessorCount() * numBlocksPerSm_topk); - numBlocks_perBatch = max(numBlocks / sizeBatch, 1); - if (maxSamples <= numThreads * 10) { - // When number of sample is small, using multiple thread-blocks does not - // improve performance, in which case cta_kernel is used. Tentatively, - // "numThreads * 10" is used as the threshold, but this may be better - // determined by auto-tuning, etc. - numBlocks_perBatch = 1; - } - uint32_t* count = (uint32_t*)workspace; - uint8_t* state = NULL; - if (stateBitLen == 8) { - state = (uint8_t*)count + Pow2<128>::roundUp(sizeof(uint32_t) * sizeBatch * 2 * 256); - } - - dim3 threads(numThreads, 1, 1); - dim3 blocks(numBlocks_perBatch, sizeBatch, 1); - if (numBlocks_perBatch <= 1) { - void (*cta_kernel)( - uint32_t, uint32_t, uint32_t, uint32_t*, const uint16_t*, uint8_t*, uint32_t*); - int vecLen = _get_vecLen(maxSamples); - if (vecLen == 8) { - cta_kernel = kern_topk_cta_8; - } else if (vecLen == 4) { - cta_kernel = kern_topk_cta_8; - } else if (vecLen == 2) { - cta_kernel = kern_topk_cta_8; - } else if (vecLen == 1) { - cta_kernel = kern_topk_cta_8; - } - cta_kernel<<>>( - topK, sizeBatch, maxSamples, numSamples, (const uint16_t*)samples, state, labels); - } else { - void* args[9]; - args[0] = {&(topK)}; - args[1] = {&(sizeBatch)}; - args[2] = {&(maxSamples)}; - args[3] = {&(numSamples)}; - args[4] = {&(samples)}; - args[5] = {&(state)}; - args[6] = {&(labels)}; - args[7] = {&(count)}; - args[8] = {nullptr}; - RAFT_CUDA_TRY( - cudaLaunchCooperativeKernel((void*)cg_kernel, blocks, threads, args, 0, handle.get_stream())); - } -} - -/** - * - * End of topk - * - * - * - * - * - * - * - * - * - * - * Start of ivfpq - */ - -// -inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& desc); - -// search -template -inline void ivfpq_search(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - uint32_t numQueries, - const float* clusterCenters, // [numDataset, dimDataset] - const float* pqCenters, // [dimPq, 256, lenPq] - const uint8_t* pqDataset, // [numDataset, dimPq] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* cluster_offsets, // [numClusters + 1] - const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] - const float* query, // [dimDataset] - uint64_t* topKNeighbors, // [topK] - float* topKDistances, // [topK] - void* workspace); - -inline void ivfpq_encode(uint32_t numDataset, - uint32_t ldDataset, // (*) ldDataset >= numDataset - uint32_t dimPq, - uint32_t bitPq, // 4 <= bitPq <= 8 - const uint32_t* label, // [dimPq, ldDataset] - uint8_t* output // [numDataset, dimPq] -); - -// -bool manage_local_topk(cuannIvfPqDescriptor_t& desc); -inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads); - -// -__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] - float initValue, - uint32_t num); - -// -__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList); - -// -__global__ void ivfpq_make_chunk_index_ptr( - uint32_t numProbes, - uint32_t sizeBatch, - const uint32_t* cluster_offsets, // [numClusters + 1,] - const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] - uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] - uint32_t* numSamples // [sizeBatch,] -); - -// -template 
-__global__ void ivfpq_make_outputs(uint32_t numProbes, - uint32_t topk, - uint32_t maxSamples, - uint32_t sizeBatch, - const uint32_t* clusterIndexPtr, // [numClusters + 1] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterLabels, // [sizeBatch, numProbes] - const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] - const scoreDtype* scores, // [sizeBatch, maxSamples] or - // [sizeBatch, numProbes, topk] - const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] - const uint32_t* topkSampleIds, // [sizeBatch, topk] - uint64_t* topkNeighbors, // [sizeBatch, topk] - float* topkScores // [sizeBatch, topk] -); - -// -__device__ inline uint32_t warp_scan(uint32_t x) -{ - uint32_t y; - y = __shfl_up_sync(0xffffffff, x, 1); - if (threadIdx.x % 32 >= 1) x += y; - y = __shfl_up_sync(0xffffffff, x, 2); - if (threadIdx.x % 32 >= 2) x += y; - y = __shfl_up_sync(0xffffffff, x, 4); - if (threadIdx.x % 32 >= 4) x += y; - y = __shfl_up_sync(0xffffffff, x, 8); - if (threadIdx.x % 32 >= 8) x += y; - y = __shfl_up_sync(0xffffffff, x, 16); - if (threadIdx.x % 32 >= 16) x += y; - return x; -} - -// -__device__ inline uint32_t thread_block_scan(uint32_t x, uint32_t* smem) -{ - x = warp_scan(x); - __syncthreads(); - if (threadIdx.x % 32 == 31) { smem[threadIdx.x / 32] = x; } - __syncthreads(); - if (threadIdx.x < 32) { smem[threadIdx.x] = warp_scan(smem[threadIdx.x]); } - __syncthreads(); - if (threadIdx.x / 32 > 0) { x += smem[threadIdx.x / 32 - 1]; } - __syncthreads(); - return x; -} - -// -__global__ void ivfpq_make_chunk_index_ptr( - uint32_t numProbes, - uint32_t sizeBatch, - const uint32_t* cluster_offsets, // [numClusters + 1,] - const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] - uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] - uint32_t* numSamples // [sizeBatch,] -) -{ - __shared__ uint32_t smem_temp[32]; - __shared__ uint32_t smem_base[2]; - - uint32_t iBatch = blockIdx.x; - if (iBatch >= sizeBatch) return; - const uint32_t* clusterLabelsToProbe = _clusterLabelsToProbe + (numProbes * iBatch); - uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); - - // - uint32_t j_end = (numProbes + 1024 - 1) / 1024; - for (uint32_t j = 0; j < j_end; j++) { - uint32_t i = threadIdx.x + (1024 * j); - uint32_t val = 0; - if (i < numProbes) { - uint32_t l = clusterLabelsToProbe[i]; - val = cluster_offsets[l + 1] - cluster_offsets[l]; - } - val = thread_block_scan(val, smem_temp); - - if (i < numProbes) { - if (j > 0) { val += smem_base[(j - 1) & 0x1]; } - chunkIndexPtr[i] = val; - if (i == numProbes - 1) { numSamples[iBatch] = val; } - } - - if ((j < j_end - 1) && (threadIdx.x == 1023)) { smem_base[j & 0x1] = val; } - } -} - -// -__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] - float initValue, - uint32_t num) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= num) return; - topkScores[i] = initValue; -} - -// -__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= numElement) return; - indexList[i] = i; -} - -// -__device__ inline void ivfpq_get_id_dataset(uint32_t iSample, - uint32_t numProbes, - const uint32_t* clusterIndexPtr, // [numClusters + 1,] - const uint32_t* clusterLabels, // [numProbes,] - const uint32_t* chunkIndexPtr, // [numProbes,] - uint32_t& iChunk, - uint32_t& label, - uint32_t& iDataset) -{ - uint32_t minChunk = 0; - uint32_t maxChunk = numProbes - 1; - iChunk = (minChunk + maxChunk) / 2; - while (minChunk 
< maxChunk) { - if (iSample >= chunkIndexPtr[iChunk]) { - minChunk = iChunk + 1; - } else { - maxChunk = iChunk; - } - iChunk = (minChunk + maxChunk) / 2; - } - - label = clusterLabels[iChunk]; - uint32_t iSampleInChunk = iSample; - if (iChunk > 0) { iSampleInChunk -= chunkIndexPtr[iChunk - 1]; } - iDataset = iSampleInChunk + clusterIndexPtr[label]; -} - -// -template -__global__ void ivfpq_make_outputs(uint32_t numProbes, - uint32_t topk, - uint32_t maxSamples, - uint32_t sizeBatch, - const uint32_t* clusterIndexPtr, // [numClusters + 1] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterLabels, // [sizeBatch, numProbes] - const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] - const scoreDtype* scores, // [sizeBatch, maxSamples] or - // [sizeBatch, numProbes, topk] - const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] - const uint32_t* topkSampleIds, // [sizeBatch, topk] - uint64_t* topkNeighbors, // [sizeBatch, topk] - float* topkScores // [sizeBatch, topk] -) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= topk) return; - uint32_t iBatch = blockIdx.y; - if (iBatch >= sizeBatch) return; - - uint32_t iSample = topkSampleIds[i + (topk * iBatch)]; - if (scoreTopkIndex == NULL) { - // 0 <= iSample < maxSamples - topkScores[i + (topk * iBatch)] = scores[iSample + (maxSamples * iBatch)]; - uint32_t iChunk; - uint32_t label; - uint32_t iDataset; - ivfpq_get_id_dataset(iSample, - numProbes, - clusterIndexPtr, - clusterLabels + (numProbes * iBatch), - chunkIndexPtr + (numProbes * iBatch), - iChunk, - label, - iDataset); - topkNeighbors[i + (topk * iBatch)] = originalNumbers[iDataset]; - } else { - // 0 <= iSample < (numProbes * topk) - topkScores[i + (topk * iBatch)] = scores[iSample + ((numProbes * topk) * iBatch)]; - uint32_t iDataset = scoreTopkIndex[iSample + ((numProbes * topk) * iBatch)]; - topkNeighbors[i + (topk * iBatch)] = originalNumbers[iDataset]; - } -} - -// -inline bool manage_local_topk(cuannIvfPqDescriptor_t& desc) -{ - int depth = (desc->topK + 31) / 32; - if (depth > 4) { return false; } - if (desc->numProbes < 16) { return false; } - if (desc->maxBatchSize * desc->numProbes < 256) { return false; } - return true; -} - -// -inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads) -{ - if (manage_local_topk(desc)) { - int topk_32 = (desc->topK + 31) / 32; - return (sizeof(float) + sizeof(uint32_t)) * (numThreads / 2) * topk_32; - } - return 0; -} - -// return workspace size -inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& desc) -{ - size_t size = 0; - // clusterLabelsOut [maxBatchSize, numProbes] - size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); - // indexList [maxBatchSize * numProbes] - size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); - // indexListSorted [maxBatchSize * numProbes] - size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); - // numSamples [maxBatchSize,] - size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize); - // cubWorkspace - void* d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - uint32_t* d_keys_in = NULL; - uint32_t* d_keys_out = NULL; - uint32_t* d_values_in = NULL; - uint32_t* d_values_out = NULL; - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - desc->maxBatchSize * desc->numProbes); - desc->sizeCubWorkspace = 
Pow2<128>::roundUp(temp_storage_bytes); - size += desc->sizeCubWorkspace; - // chunkIndexPtr [maxBatchSize, numProbes] - size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes); - // topkSids [maxBatchSize, topk] - size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->topK); - // similarity - size_t unit_size = sizeof(float); - if (desc->internalDistanceDtype == CUDA_R_16F) { unit_size = sizeof(half); } - if (manage_local_topk(desc)) { - // [matBatchSize, numProbes, topK] - size += Pow2<128>::roundUp(unit_size * desc->maxBatchSize * desc->numProbes * desc->topK); - } else { - // [matBatchSize, maxSamples] - size += Pow2<128>::roundUp(unit_size * desc->maxBatchSize * desc->maxSamples); - } - // simTopkIndex - if (manage_local_topk(desc)) { - // [matBatchSize, numProbes, topk] - size += - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes * desc->topK); - } - // topkScores - if (manage_local_topk(desc)) { - // [maxBatchSize, topk] - size += Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK); - } - // preCompScores [multiProcessorCount, dimPq, 1 << bitPq,] - size += - Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * desc->dimPq * (1 << desc->bitPq)); - // topkWorkspace - if (manage_local_topk(desc)) { - size += _cuann_find_topk_bufferSize(handle, - desc->topK, - desc->maxBatchSize, - desc->numProbes * desc->topK, - desc->internalDistanceDtype); - } else { - size += _cuann_find_topk_bufferSize( - handle, desc->topK, desc->maxBatchSize, desc->maxSamples, desc->internalDistanceDtype); - } - return size; -} - -// -__device__ __host__ inline void ivfpq_encode_core( - uint32_t ldDataset, uint32_t dimPq, uint32_t bitPq, const uint32_t* label, uint8_t* output) -{ - for (uint32_t j = 0; j < dimPq; j++) { - uint8_t code = label[(ldDataset * j)]; - if (bitPq == 8) { - uint8_t* ptrOutput = output + j; - ptrOutput[0] = code; - } else if (bitPq == 7) { - uint8_t* ptrOutput = output + 7 * (j / 8); - if (j % 8 == 0) { - ptrOutput[0] |= code; - } else if (j % 8 == 1) { - ptrOutput[0] |= code << 7; - ptrOutput[1] |= code >> 1; - } else if (j % 8 == 2) { - ptrOutput[1] |= code << 6; - ptrOutput[2] |= code >> 2; - } else if (j % 8 == 3) { - ptrOutput[2] |= code << 5; - ptrOutput[3] |= code >> 3; - } else if (j % 8 == 4) { - ptrOutput[3] |= code << 4; - ptrOutput[4] |= code >> 4; - } else if (j % 8 == 5) { - ptrOutput[4] |= code << 3; - ptrOutput[5] |= code >> 5; - } else if (j % 8 == 6) { - ptrOutput[5] |= code << 2; - ptrOutput[6] |= code >> 6; - } else if (j % 8 == 7) { - ptrOutput[6] |= code << 1; - } - } else if (bitPq == 6) { - uint8_t* ptrOutput = output + 3 * (j / 4); - if (j % 4 == 0) { - ptrOutput[0] |= code; - } else if (j % 4 == 1) { - ptrOutput[0] |= code << 6; - ptrOutput[1] |= code >> 2; - } else if (j % 4 == 2) { - ptrOutput[1] |= code << 4; - ptrOutput[2] |= code >> 4; - } else if (j % 4 == 3) { - ptrOutput[2] |= code << 2; - } - } else if (bitPq == 5) { - uint8_t* ptrOutput = output + 5 * (j / 8); - if (j % 8 == 0) { - ptrOutput[0] |= code; - } else if (j % 8 == 1) { - ptrOutput[0] |= code << 5; - ptrOutput[1] |= code >> 3; - } else if (j % 8 == 2) { - ptrOutput[1] |= code << 2; - } else if (j % 8 == 3) { - ptrOutput[1] |= code << 7; - ptrOutput[2] |= code >> 1; - } else if (j % 8 == 4) { - ptrOutput[2] |= code << 4; - ptrOutput[3] |= code >> 4; - } else if (j % 8 == 5) { - ptrOutput[3] |= code << 1; - } else if (j % 8 == 6) { - ptrOutput[3] |= code << 6; - ptrOutput[4] |= code >> 2; - } else if 
(j % 8 == 7) { - ptrOutput[4] |= code << 3; - } - } else if (bitPq == 4) { - uint8_t* ptrOutput = output + (j / 2); - if (j % 2 == 0) { - ptrOutput[0] |= code; - } else { - ptrOutput[0] |= code << 4; - } - } - } -} - -// -__global__ void ivfpq_encode_kernel(uint32_t numDataset, - uint32_t ldDataset, // (*) ldDataset >= numDataset - uint32_t dimPq, - uint32_t bitPq, // 4 <= bitPq <= 8 - const uint32_t* label, // [dimPq, ldDataset] - uint8_t* output // [numDataset, dimPq] -) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= numDataset) return; - ivfpq_encode_core(ldDataset, dimPq, bitPq, label + i, output + (dimPq * bitPq / 8) * i); -} - -// -inline void ivfpq_encode(uint32_t numDataset, - uint32_t ldDataset, // (*) ldDataset >= numDataset - uint32_t dimPq, - uint32_t bitPq, // 4 <= bitPq <= 8 - const uint32_t* label, // [dimPq, ldDataset] - uint8_t* output // [numDataset, dimPq] -) -{ -#if 1 - // GPU - dim3 iekThreads(128, 1, 1); - dim3 iekBlocks((numDataset + iekThreads.x - 1) / iekThreads.x, 1, 1); - ivfpq_encode_kernel<<>>( - numDataset, ldDataset, dimPq, bitPq, label, output); -#else - // CPU - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - for (uint32_t i = 0; i < numDataset; i++) { - ivfpq_encode_core(ldDataset, dimPq, bitPq, label + i, output + (dimPq * bitPq / 8) * i); - } -#endif -} - -// -template __global__ void ivfpq_make_outputs( - uint32_t numProbes, - uint32_t topk, - uint32_t maxSamples, - uint32_t sizeBatch, - const uint32_t* clusterIndexPtr, // [numClusters + 1] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterLabels, // [sizeBatch, numProbes] - const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] - const float* scores, // [sizeBatch, maxSamples] or - // [sizeBatch, numProbes, topk] - const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] - const uint32_t* topkSampleIds, // [sizeBatch, topk] - uint64_t* topkNeighbors, // [sizeBatch, topk] - float* topkScores // [sizeBatch, topk] -); - -// -template __global__ void ivfpq_make_outputs( - uint32_t numProbes, - uint32_t topk, - uint32_t maxSamples, - uint32_t sizeBatch, - const uint32_t* clusterIndexPtr, // [numClusters + 1] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterLabels, // [sizeBatch, numProbes] - const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] - const half* scores, // [sizeBatch, maxSamples] or - // [sizeBatch, numProbes, topk] - const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] - const uint32_t* topkSampleIds, // [sizeBatch, topk] - uint64_t* topkNeighbors, // [sizeBatch, topk] - float* topkScores // [sizeBatch, topk] -); - -/** - * End of ivfpq - * - * - * - * - */ - -inline void cuannIvfPqSetIndexParameters( - cuannIvfPqDescriptor_t& desc, - const uint32_t numClusters, /* Number of clusters */ - const uint32_t numDataset, /* Number of dataset entries */ - const uint32_t dimDataset, /* Dimension of each entry */ - const uint32_t dimPq, /* Dimension of each entry after product quantization */ - const uint32_t bitPq, /* Bit length of PQ */ - const distance::DistanceType metric, - const cuannPqCenter_t typePqCenter); - -inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - distance::DistanceType* metric, - cuannPqCenter_t* typePqCenter); - -inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, - size_t* size /* bytes of dataset index */); - -inline size_t 
_cuann_getIndexSize_clusterCenters(cuannIvfPqDescriptor_t& desc) -{ - // [numClusters, dimDatasetExt] - return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimDatasetExt); -} - -inline size_t _cuann_getIndexSize_pqCenters(cuannIvfPqDescriptor_t& desc) -{ - size_t size_base = sizeof(float) * (1 << desc->bitPq) * desc->lenPq; - if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - // [dimPq, 1 << bitPq, lenPq] - return Pow2<128>::roundUp(desc->dimPq * size_base); - } else { - // [numClusters, 1 << bitPq, lenPq] - return Pow2<128>::roundUp(desc->numClusters * size_base); - } -} - -inline size_t _cuann_getIndexSize_pqDataset(cuannIvfPqDescriptor_t& desc) -{ - // [numDataset, dimPq * bitPq / 8] - return Pow2<128>::roundUp(sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8); -} - -inline size_t _cuann_getIndexSize_originalNumbers(cuannIvfPqDescriptor_t& desc) -{ - // [numDataset,] - return Pow2<128>::roundUp(sizeof(uint32_t) * desc->numDataset); -} - -inline size_t _cuann_getIndexSize_indexPtr(cuannIvfPqDescriptor_t& desc) -{ - // [numClusters + 1,] - return Pow2<128>::roundUp(sizeof(uint32_t) * (desc->numClusters + 1)); -} - -inline size_t _cuann_getIndexSize_rotationMatrix(cuannIvfPqDescriptor_t& desc) -{ - // [dimDataset, dimRotDataset] - return Pow2<128>::roundUp(sizeof(float) * desc->dimDataset * desc->dimRotDataset); -} - -inline size_t _cuann_getIndexSize_clusterRotCenters(cuannIvfPqDescriptor_t& desc) -{ - // [numClusters, dimRotDataset] - return Pow2<128>::roundUp(sizeof(float) * desc->numClusters * desc->dimRotDataset); -} - -inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t& desc, - struct cuannIvfPqIndexHeader** header, - float** clusterCenters, // [numClusters, dimDatasetExt] - float** pqCenters, // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t** pqDataset, // [numDataset, dimPq * bitPq / 8] - uint32_t** originalNumbers, // [numDataset] - uint32_t** cluster_offsets, // [numClusters + 1] - float** rotationMatrix, // [dimDataset, dimRotDataset] - float** clusterRotCenters // [numClusters, dimRotDataset] -) -{ - *header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); - *clusterCenters = (float*)((uint8_t*)(*header) + sizeof(struct cuannIvfPqIndexHeader)); - *pqCenters = (float*)((uint8_t*)(*clusterCenters) + _cuann_getIndexSize_clusterCenters(desc)); - *pqDataset = (uint8_t*)((uint8_t*)(*pqCenters) + _cuann_getIndexSize_pqCenters(desc)); - *originalNumbers = (uint32_t*)((uint8_t*)(*pqDataset) + _cuann_getIndexSize_pqDataset(desc)); - *cluster_offsets = - (uint32_t*)((uint8_t*)(*originalNumbers) + _cuann_getIndexSize_originalNumbers(desc)); - *rotationMatrix = (float*)((uint8_t*)(*cluster_offsets) + _cuann_getIndexSize_indexPtr(desc)); - *clusterRotCenters = - (float*)((uint8_t*)(*rotationMatrix) + _cuann_getIndexSize_rotationMatrix(desc)); -} - -__global__ void kern_get_cluster_size(uint32_t numClusters, - const uint32_t* cluster_offsets, // [numClusters + 1,] - uint32_t* clusterSize // [numClusters,] -) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= numClusters) return; - clusterSize[i] = cluster_offsets[i + 1] - cluster_offsets[i]; -} - -template -int descending(const void* a, const void* b) -{ - T valA = ((T*)a)[0]; - T valB = ((T*)b)[0]; - if (valA > valB) return -1; - if (valA < valB) return 1; - return 0; -} - -// (*) This is temporal. Need to be removed in future. 
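// Tiny usage example for the comparator above (the values are illustrative): qsort
// with descending<uint32_t> orders cluster sizes from largest to smallest, which is
// what the inclusive-sum-of-sorted-cluster-sizes helper below depends on.
#include <cstdint>
#include <cstdlib>

inline void descending_sort_example()
{
  uint32_t sizes[5] = {3, 9, 0, 7, 1};
  qsort(sizes, 5, sizeof(uint32_t), descending<uint32_t>);
  // sizes is now {9, 7, 3, 1, 0}
}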
-inline void _cuann_get_random_norm_vector(int len, float* vector) -{ - float sqsum = 0.0; - for (int i = 0; i < len; i++) { - vector[i] = ((float)rand() / RAND_MAX) * 2.0 - 1.0; - sqsum += vector[i] * vector[i]; - } - float norm = sqrt(sqsum); - for (int i = 0; i < len; i++) { - vector[i] /= norm; - } -} - -inline void _cuann_get_inclusiveSumSortedClusterSize( - cuannIvfPqDescriptor_t& desc, - const uint32_t* cluster_offsets, // [numClusters + 1] - float* clusterCenters, // [numClusters, dimDatasetExt] - uint32_t** output // [numClusters] -) -{ - // [CPU] - if (*output != nullptr) { free(*output); } - *output = (uint32_t*)malloc(sizeof(uint32_t) * desc->numClusters); - desc->_numClustersSize0 = 0; - for (uint32_t i = 0; i < desc->numClusters; i++) { - (*output)[i] = cluster_offsets[i + 1] - cluster_offsets[i]; - if ((*output)[i] > 0) continue; - - desc->_numClustersSize0 += 1; - // Work-around for clusters of size 0 - _cuann_get_random_norm_vector(desc->dimDatasetExt, clusterCenters + (desc->dimDatasetExt * i)); - } - RAFT_LOG_DEBUG("Number of clusters of size zero: %d", desc->_numClustersSize0); - // sort - qsort(*output, desc->numClusters, sizeof(uint32_t), descending); - // scan - for (uint32_t i = 1; i < desc->numClusters; i++) { - (*output)[i] += (*output)[i - 1]; - } - RAFT_EXPECTS((*output)[desc->numClusters - 1] == desc->numDataset, "cluster sizes do not add up"); -} - -inline void _cuann_get_sqsumClusters(cuannIvfPqDescriptor_t& desc, - const float* clusterCenters, // [numClusters, dimDataset,] - float** output // [numClusters,] -) -{ - if (*output != NULL) { RAFT_CUDA_TRY(cudaFree(*output)); } - RAFT_CUDA_TRY(cudaMallocManaged(output, sizeof(float) * desc->numClusters)); - switch (detail::utils::check_pointer_residency(clusterCenters, *output)) { - case detail::utils::pointer_residency::device_only: - case detail::utils::pointer_residency::host_and_device: break; - default: RAFT_FAIL("_cuann_get_sqsumClusters: not all pointers are available on the device."); - } - rmm::cuda_stream_default.synchronize(); - detail::utils::dots_along_rows( - desc->numClusters, desc->dimDataset, clusterCenters, *output, rmm::cuda_stream_default); - rmm::cuda_stream_default.synchronize(); -} - -// -template -T _cuann_dot(int n, const T* x, int incX, const T* y, int incY) -{ - T val = 0; - for (int i = 0; i < n; i++) { - val += x[incX * i] * y[incY * i]; - } - return val; -} - -// -template -T _cuann_dot(int n, const X* x, int incX, const Y* y, int incY, T divisor = 1) -{ - T val = 0; - for (int i = 0; i < n; i++) { - val += (T)(x[incX * i]) * (T)(y[incY * i]) / divisor; - } - return val; -} - -// -template -T _cuann_rand() -{ - return (T)rand() / RAND_MAX; -} - -// make rotation matrix -inline void _cuann_make_rotation_matrix(uint32_t nRows, - uint32_t nCols, - uint32_t lenPq, - bool randomRotation, - float* rotationMatrix // [nRows, nCols] -) -{ - RAFT_EXPECTS( - nRows >= nCols, "number of rows (%u) must be larger than number or cols (%u)", nRows, nCols); - RAFT_EXPECTS( - nRows % lenPq == 0, "number of rows (%u) must be a multiple of lenPq (%u)", nRows, lenPq); - - if (randomRotation) { - RAFT_LOG_DEBUG("Creating a random rotation matrix."); - double dot, norm; - double* matrix = (double*)malloc(sizeof(double) * nRows * nCols); - memset(matrix, 0, sizeof(double) * nRows * nCols); - for (uint32_t i = 0; i < nRows * nCols; i++) { - matrix[i] = _cuann_rand() - 0.5; - } - for (uint32_t j = 0; j < nCols; j++) { - // normalize the j-th col vector - norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix 
+ j, nCols)); - for (uint32_t i = 0; i < nRows; i++) { - matrix[j + (nCols * i)] /= norm; - } - // orthogonalize the j-th col vector with the previous col vectors - for (uint32_t k = 0; k < j; k++) { - dot = _cuann_dot(nRows, matrix + j, nCols, matrix + k, nCols); - for (uint32_t i = 0; i < nRows; i++) { - matrix[j + (nCols * i)] -= dot * matrix[k + (nCols * i)]; - } - } - // normalize the j-th col vector again - norm = sqrt(_cuann_dot(nRows, matrix + j, nCols, matrix + j, nCols)); - for (uint32_t i = 0; i < nRows; i++) { - matrix[j + (nCols * i)] /= norm; - } - } - for (uint32_t i = 0; i < nRows * nCols; i++) { - rotationMatrix[i] = (float)matrix[i]; - } - free(matrix); - } else { - if (nRows == nCols) { - memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); - for (uint32_t i = 0; i < nCols; i++) { - rotationMatrix[i + (nCols * i)] = 1.0; - } - } else { - memset(rotationMatrix, 0, sizeof(float) * nRows * nCols); - uint32_t i = 0; - for (uint32_t j = 0; j < nCols; j++) { - rotationMatrix[j + (nCols * i)] = 1.0; - i += lenPq; - if (i >= nRows) { i = (i % nRows) + 1; } - } - } - } -} - -// show centers (for debuging) -inline void _cuann_kmeans_show_centers(const float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const uint32_t* centerSize, - const uint32_t numShow = 5) -{ -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - for (uint64_t k = 0; k < numCenters; k++) { - if ((numShow <= k) && (k < numCenters - numShow)) { - if (k == numShow) fprintf(stderr, "...\n"); - continue; - } - fprintf(stderr, "# centers[%lu]:", k); - for (uint64_t j = 0; j < dimCenters; j++) { - if ((numShow <= j) && (j < dimCenters - numShow)) { - if (j == numShow) fprintf(stderr, " ... "); - continue; - } - fprintf(stderr, " %f,", centers[j + (dimCenters * k)]); - } - fprintf(stderr, " %d\n", centerSize[k]); - } -#endif -} - -// show dataset (for debugging) -inline void _cuann_show_dataset(const float* dataset, // [numDataset, dimDataset] - uint32_t numDataset, - uint32_t dimDataset, - const uint32_t numShow = 5) -{ -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - for (uint64_t i = 0; i < numDataset; i++) { - if ((numShow <= i) && (i < numDataset - numShow)) { - if (i == numShow) fprintf(stderr, "...\n"); - continue; - } - fprintf(stderr, "# dataset[%lu]:", i); - for (uint64_t j = 0; j < dimDataset; j++) { - if ((numShow <= j) && (j < dimDataset - numShow)) { - if (j == numShow) fprintf(stderr, " ... "); - continue; - } - fprintf(stderr, " %.3f,", dataset[j + (dimDataset * i)]); - } - fprintf(stderr, "\n"); - } -#endif -} - -// show pq code (for debuging) -inline void _cuann_show_pq_code(const uint8_t* pqDataset, // [numDataset, dimPq] - uint32_t numDataset, - uint32_t dimPq, - const uint32_t numShow = 5) -{ -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - for (uint64_t i = 0; i < numDataset; i++) { - if ((numShow <= i) && (i < numDataset - numShow)) { - if (i == numShow) fprintf(stderr, "...\n"); - continue; - } - fprintf(stderr, "# dataset[%lu]:", i); - for (uint64_t j = 0; j < dimPq; j++) { - if ((numShow <= j) && (j < dimPq - numShow)) { - if (j == numShow) fprintf(stderr, " ... 
"); - continue; - } - fprintf(stderr, " %u,", pqDataset[j + (dimPq * i)]); - } - fprintf(stderr, "\n"); - } -#endif -} - -// -int _cuann_set_device(int devId) -{ - int orgDevId; - RAFT_CUDA_TRY(cudaGetDevice(&orgDevId)); - RAFT_CUDA_TRY(cudaSetDevice(devId)); - return orgDevId; -} - -// -uint32_t _get_num_trainset(uint32_t clusterSize, uint32_t dimPq, uint32_t bitPq) -{ - return min(clusterSize * dimPq, 256 * max(1 << bitPq, dimPq)); -} - -// -template -void _cuann_compute_PQ_code(const handle_t& handle, - uint32_t numDataset, - uint32_t dimDataset, - uint32_t dimRotDataset, - uint32_t dimPq, - uint32_t lenPq, - uint32_t bitPq, - uint32_t numClusters, - cuannPqCenter_t typePqCenter, - uint32_t maxClusterSize, - float* clusterCenters, // [numClusters, dimDataset] - const float* rotationMatrix, // [dimRotDataset, dimDataset] - const T* dataset, // [numDataset] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterSize, // [numClusters] - const uint32_t* cluster_offsets, // [numClusters + 1] - float* pqCenters, // [...] - uint32_t numIterations, - uint8_t* pqDataset // [numDataset, dimPq * bitPq / 8] -) -{ - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); - if (pool_guard) { - RAFT_LOG_DEBUG("_cuann_compute_PQ_code: using pool memory resource with initial size %zu bytes", - pool_guard->pool_size()); - } - // - // Compute PQ code - // - memset(pqDataset, 0, sizeof(uint8_t) * numDataset * dimPq * bitPq / 8); - float** resVectors; // [numDevices][maxClusterSize, dimDataset] - float** rotVectors; // [numDevices][maxClusterSize, dimRotDataset] - float** subVectors; // [numDevices][dimPq, maxClusterSize, lenPq] - uint32_t** subVectorLabels; // [numDevices][dimPq, maxClusterSize] - uint8_t** myPqDataset; // [numDevices][maxCluserSize, dimPq * bitPq / 8] - resVectors = _cuann_multi_device_malloc(1, maxClusterSize * dimDataset, "resVectors"); - rotVectors = _cuann_multi_device_malloc(1, maxClusterSize * dimRotDataset, "rotVectors"); - subVectors = _cuann_multi_device_malloc(1, dimPq * maxClusterSize * lenPq, "subVectors"); - subVectorLabels = - _cuann_multi_device_malloc(1, dimPq * maxClusterSize, "subVectorLabels"); - myPqDataset = - _cuann_multi_device_malloc(1, maxClusterSize * dimPq * bitPq / 8, "myPqDataset"); - - uint32_t maxTrainset = 0; - if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { - maxTrainset = _get_num_trainset(maxClusterSize, dimPq, bitPq); - } - void** pqPredictWorkspace = (void**)_cuann_multi_device_malloc( - 1, - _cuann_kmeans_predict_bufferSize((1 << bitPq), lenPq, max(maxClusterSize, maxTrainset)), - "pqPredictWorkspace"); - - uint32_t** rotVectorLabels; // [numDevices][maxClusterSize, dimPq,] - uint32_t** pqClusterSize; // [numDevices][1 << bitPq,] - uint32_t** wsKAC; // [numDevices][1] - float** myPqCenters; // [numDevices][1 << bitPq, lenPq] - float** myPqCentersTemp; // [numDevices][1 << bitPq, lenPq] - if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { - memset(pqCenters, 0, sizeof(float) * numClusters * (1 << bitPq) * lenPq); - rotVectorLabels = - _cuann_multi_device_malloc(1, maxClusterSize * dimPq, "rotVectorLabels"); - pqClusterSize = _cuann_multi_device_malloc(1, (1 << bitPq), "pqClusterSize"); - wsKAC = _cuann_multi_device_malloc(1, 1, "wsKAC"); - myPqCenters = _cuann_multi_device_malloc(1, (1 << bitPq) * lenPq, "myPqCenters"); - myPqCentersTemp = _cuann_multi_device_malloc(1, (1 << bitPq) * lenPq, 
"myPqCentersTemp"); - } - -#pragma omp parallel for schedule(dynamic) num_threads(1) - for (uint32_t l = 0; l < numClusters; l++) { - int devId = omp_get_thread_num(); - RAFT_CUDA_TRY(cudaSetDevice(devId)); - if (devId == 0) { - fprintf(stderr, "(%s) Making PQ dataset: %u / %u \r", __func__, l, numClusters); - } - if (clusterSize[l] == 0) continue; - - // - // Compute the residual vector of the new vector with its cluster - // centroids. - // resVectors[..] = newVectors[..] - clusterCenters[..] - // - detail::utils::copy_selected(clusterSize[l], - dimDataset, - dataset, - originalNumbers + cluster_offsets[l], - dimDataset, - resVectors[devId], - dimDataset, - handle.get_stream()); - _cuann_a_me_b(clusterSize[l], - dimDataset, - resVectors[devId], - dimDataset, - clusterCenters + (uint64_t)l * dimDataset); - - // - // Rotate the residual vectors using a rotation matrix - // - float alpha = 1.0; - float beta = 0.0; - linalg::gemm(handle, - true, - false, - dimRotDataset, - clusterSize[l], - dimDataset, - &alpha, - rotationMatrix, - dimDataset, - resVectors[devId], - dimDataset, - &beta, - rotVectors[devId], - dimRotDataset, - handle.get_stream()); - - // - // Training PQ codebook if CUANN_PQ_CENTER_PER_CLUSTER - // (*) PQ codebooks are trained for each cluster. - // - if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { - uint32_t numTrainset = _get_num_trainset(clusterSize[l], dimPq, bitPq); - int numIterations_2 = numIterations * 2; - for (int iter = 0; iter < numIterations_2; iter += 2) { - if (devId == 0) { - fprintf(stderr, - "(%s) Making PQ dataset: %u / %u, " - "Training PQ codebook (%u): %.1f / %u \r", - __func__, - l, - numClusters, - numTrainset, - (float)iter / 2, - numIterations); - } - _cuann_kmeans_predict(handle, - myPqCenters[devId], - (1 << bitPq), - lenPq, - rotVectors[devId], - CUDA_R_32F, - numTrainset, - rotVectorLabels[devId], - raft::distance::DistanceType::L2Expanded, - (iter != 0), - pqPredictWorkspace[devId], - myPqCentersTemp[devId], - pqClusterSize[devId], - true); - if ((iter + 1 < numIterations_2) && detail::kmeans::adjust_centers(myPqCenters[devId], - (1 << bitPq), - lenPq, - rotVectors[devId], - numTrainset, - rotVectorLabels[devId], - pqClusterSize[devId], - (float)1.0 / 4, - device_memory, - handle.get_stream())) { - iter -= 1; - } - } - RAFT_CUDA_TRY(cudaMemcpy(pqCenters + ((1 << bitPq) * lenPq) * l, - myPqCenters[devId], - sizeof(float) * (1 << bitPq) * lenPq, - cudaMemcpyDeviceToHost)); - } - - // - // Change the order of the vector data to facilitate processing in - // each vector subspace. - // input: rotVectors[clusterSize, dimRotDataset] - // output: subVectors[dimPq, clusterSize, lenPq] - // - _cuann_transpose_copy_3d(lenPq, - clusterSize[l], - dimPq, - subVectors[devId], - lenPq, - clusterSize[l], - rotVectors[devId], - 1, - dimRotDataset, - lenPq); - - // - // Find a label (cluster ID) for each vector subspace. 
- // - for (uint32_t j = 0; j < dimPq; j++) { - float* curPqCenters = NULL; - if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - curPqCenters = pqCenters + ((1 << bitPq) * lenPq) * j; - } else if (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER) { - curPqCenters = pqCenters + ((1 << bitPq) * lenPq) * l; - if (numIterations > 0) { curPqCenters = myPqCenters[devId]; } - } - _cuann_kmeans_predict(handle, - curPqCenters, - (1 << bitPq), - lenPq, - subVectors[devId] + j * (clusterSize[l] * lenPq), - CUDA_R_32F, - clusterSize[l], - subVectorLabels[devId] + j * clusterSize[l], - raft::distance::DistanceType::L2Expanded, - true, - pqPredictWorkspace[devId], - nullptr, - nullptr, - true); - } - - // - // PQ encoding - // - ivfpq_encode( - clusterSize[l], clusterSize[l], dimPq, bitPq, subVectorLabels[devId], myPqDataset[devId]); - RAFT_CUDA_TRY(cudaMemcpy(pqDataset + ((uint64_t)cluster_offsets[l] * dimPq * bitPq / 8), - myPqDataset[devId], - sizeof(uint8_t) * clusterSize[l] * dimPq * bitPq / 8, - cudaMemcpyDeviceToHost)); - } - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - fprintf(stderr, "\n"); - - // - _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, 1); - _cuann_multi_device_free(myPqDataset, 1); - _cuann_multi_device_free(subVectorLabels, 1); - _cuann_multi_device_free(subVectors, 1); - _cuann_multi_device_free(rotVectors, 1); - _cuann_multi_device_free(resVectors, 1); - if ((numIterations > 0) && (typePqCenter == CUANN_PQ_CENTER_PER_CLUSTER)) { - _cuann_multi_device_free(wsKAC, 1); - _cuann_multi_device_free(rotVectorLabels, 1); - _cuann_multi_device_free(pqClusterSize, 1); - _cuann_multi_device_free(myPqCenters, 1); - _cuann_multi_device_free(myPqCentersTemp, 1); - } -} - -// cuannIvfPqSetIndexParameters -inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t& desc, - const uint32_t numClusters, - const uint32_t numDataset, - const uint32_t dimDataset, - const uint32_t dimPq, - const uint32_t bitPq, - const distance::DistanceType metric, - const cuannPqCenter_t typePqCenter) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - RAFT_EXPECTS(numClusters > 0, "(%s) numClusters must be larger than zero.", __func__); - RAFT_EXPECTS(numDataset > 0, "(%s) numDataset must be larger than zero.", __func__); - RAFT_EXPECTS(dimDataset > 0, "(%s) dimDataset must be larger than zero.", __func__); - RAFT_EXPECTS(dimPq > 0, "(%s) dimPq must be larger than zero.", __func__); - RAFT_EXPECTS(numClusters <= numDataset, - "(%s) numClusters must be smaller than numDataset (numClusters:%u, numDataset:%u).", - __func__, - numClusters, - numDataset); - RAFT_EXPECTS(bitPq >= 4 && bitPq <= 8, - "(%s) bitPq must be within closed range [4,8], but got %u.", - __func__, - bitPq); - RAFT_EXPECTS((bitPq * dimPq) % 8 == 0, - "(%s) `bitPq * dimPq` must be a multiple of 8, but got %u * %u = %u.", - __func__, - bitPq, - dimPq, - bitPq * dimPq); - desc->numClusters = numClusters; - desc->numDataset = numDataset; - desc->dimDataset = dimDataset; - desc->dimDatasetExt = dimDataset + 1; - if (desc->dimDatasetExt % 8) { desc->dimDatasetExt += 8 - (desc->dimDatasetExt % 8); } - RAFT_EXPECTS(desc->dimDatasetExt >= dimDataset + 1, "unexpected dimDatasetExt"); - RAFT_EXPECTS(desc->dimDatasetExt % 8 == 0, "unexpected dimDatasetExt"); - desc->dimPq = dimPq; - desc->bitPq = bitPq; - desc->metric = metric; - desc->typePqCenter = typePqCenter; - - desc->dimRotDataset = dimDataset; - if (dimDataset % dimPq) { desc->dimRotDataset = ((dimDataset / dimPq) + 1) * dimPq; } - desc->lenPq = desc->dimRotDataset / 
dimPq; -} - -// cuannIvfPqGetIndexParameters -inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - distance::DistanceType* metric, - cuannPqCenter_t* typePqCenter) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - - *numClusters = desc->numClusters; - *numDataset = desc->numDataset; - *dimDataset = desc->dimDataset; - *dimPq = desc->dimPq; - *bitPq = desc->bitPq; - *metric = desc->metric; - *typePqCenter = desc->typePqCenter; -} - -// cuannIvfPqGetIndexSize -inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, size_t* size) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - - *size = sizeof(struct cuannIvfPqIndexHeader); - RAFT_EXPECTS(*size == 1024, "Critical error: unexpected header size."); - *size += _cuann_getIndexSize_clusterCenters(desc); - *size += _cuann_getIndexSize_pqCenters(desc); - *size += _cuann_getIndexSize_pqDataset(desc); - *size += _cuann_getIndexSize_originalNumbers(desc); - *size += _cuann_getIndexSize_indexPtr(desc); - *size += _cuann_getIndexSize_rotationMatrix(desc); - *size += _cuann_getIndexSize_clusterRotCenters(desc); -} - -template -void cuannIvfPqBuildIndex( - const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - const T* dataset, /* [numDataset, dimDataset] */ - const T* trainset, /* [numTrainset, dimDataset] */ - uint32_t numTrainset, /* Number of train-set entries */ - uint32_t numIterations, /* Number of iterations to train kmeans */ - bool randomRotation, /* If true, rotate vectors with randamly created rotation matrix */ - bool hierarchicalClustering /* If true, do kmeans training hierarchically */) -{ - int cuannDevId = handle.get_device(); - int callerDevId = _cuann_set_device(cuannDevId); - - cudaDataType_t dtype; - if constexpr (std::is_same_v) { - dtype = CUDA_R_32F; - } else if constexpr (std::is_same_v) { - dtype = CUDA_R_8U; - } else if constexpr (std::is_same_v) { - dtype = CUDA_R_8I; - } else { - static_assert( - std::is_same_v || std::is_same_v || std::is_same_v, - "unsupported type"); - } - if (desc->metric == distance::DistanceType::InnerProduct) { - RAFT_EXPECTS(dtype == CUDA_R_32F, - "Unsupported dtype (inner-product metric support float only)"); - } - - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); - if (pool_guard) { - RAFT_LOG_DEBUG("cuannIvfPqBuildIndex: using pool memory resource with initial size %zu bytes", - pool_guard->pool_size()); - } - - desc->dtypeDataset = dtype; - char dtypeString[64]; - _cuann_get_dtype_string(desc->dtypeDataset, dtypeString); - RAFT_LOG_DEBUG("Dataset dtype = %s", dtypeString); - - switch (detail::utils::check_pointer_residency(dataset, trainset)) { - case detail::utils::pointer_residency::host_only: - case detail::utils::pointer_residency::host_and_device: break; - default: RAFT_FAIL("both dataset and trainsed must be accessible from the host."); - } - - if (desc->index_ptr != NULL) { RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->index_ptr)); } - size_t index_size; - ivf_pq::cuannIvfPqGetIndexSize(desc, &index_size); - RAFT_CUDA_TRY(cudaMallocManaged(&(desc->index_ptr), index_size)); - - struct cuannIvfPqIndexHeader* header; - float* clusterCenters; // [numClusters, dimDataset] - float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] - 
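For a sense of scale in the hierarchical k-means that follows (hypothetical numbers, for illustration only): with numClusters = 10000 the code picks numMesoClusters = sqrt(10000) + 0.5 = 100, trains those 100 meso-centroids on the trainset, and then distributes the fine clusters in proportion to each meso-cluster's population, so a meso-cluster that captured 3% of the trainset is assigned about 0.03 * 10000 = 300 fine clusters; the last meso-cluster simply takes whatever remains so that the counts add up exactly to numClusters.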
uint32_t* originalNumbers; // [numDataset] - uint32_t* cluster_offsets; // [numClusters + 1] - float* rotationMatrix; // [dimDataset, dimRotDataset] - float* clusterRotCenters; // [numClusters, dimRotDataset] - _cuann_get_index_pointers(desc, - &header, - &clusterCenters, - &pqCenters, - &pqDataset, - &originalNumbers, - &cluster_offsets, - &rotationMatrix, - &clusterRotCenters); - - uint32_t* trainsetLabels; // [numTrainset] - RAFT_CUDA_TRY(cudaMallocManaged(&trainsetLabels, sizeof(uint32_t) * numTrainset)); - - uint32_t* clusterSize; // [numClusters] - RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * desc->numClusters)); - - float* clusterCentersTemp; // [numClusters, dimDataset] - RAFT_CUDA_TRY( - cudaMallocManaged(&clusterCentersTemp, sizeof(float) * desc->numClusters * desc->dimDataset)); - - uint32_t** wsKAC = _cuann_multi_device_malloc(1, 1, "wsKAC"); - - // - // Training kmeans - // - if (hierarchicalClustering) { - RAFT_LOG_DEBUG("Hierarchical clustering: enabled"); - } else { - RAFT_LOG_DEBUG("Hierarchical clustering: disabled"); - } - if (hierarchicalClustering) { - // Hierarchical kmeans - uint32_t numMesoClusters = pow((double)(desc->numClusters), (double)1.0 / 2.0) + 0.5; - RAFT_LOG_DEBUG("numMesoClusters: %u", numMesoClusters); - - float* mesoClusterCenters; // [numMesoClusters, dimDataset] - RAFT_CUDA_TRY( - cudaMallocManaged(&mesoClusterCenters, sizeof(float) * numMesoClusters * desc->dimDataset)); - - float* mesoClusterCentersTemp; // [numMesoClusters, dimDataset] - RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterCentersTemp, - sizeof(float) * numMesoClusters * desc->dimDataset)); - - uint32_t* mesoClusterLabels; // [numTrainset,] - RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterLabels, sizeof(uint32_t) * numTrainset)); - - uint32_t* mesoClusterSize; // [numMesoClusters,] - RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterSize, sizeof(uint32_t) * numMesoClusters)); - - // - // Training kmeans for meso-clusters - // - int numIterations_2 = numIterations * 2; - for (int iter = 0; iter < numIterations_2; iter += 2) { - _cuann_kmeans_predict(handle, - mesoClusterCenters, - numMesoClusters, - desc->dimDataset, - trainset, - dtype, - numTrainset, - mesoClusterLabels, - desc->metric, - (iter != 0), - NULL, - mesoClusterCentersTemp, - mesoClusterSize, - true); - if ((iter + 1 < numIterations_2) && detail::kmeans::adjust_centers(mesoClusterCenters, - numMesoClusters, - desc->dimDataset, - trainset, - numTrainset, - mesoClusterLabels, - mesoClusterSize, - (float)1.0 / 4, - device_memory, - handle.get_stream())) { - iter -= 1; - if (desc->metric == distance::DistanceType::InnerProduct) { - detail::utils::normalize_rows( - numMesoClusters, desc->dimDataset, mesoClusterCenters, handle.get_stream()); - } - } - } - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - - // Number of centers in each meso cluster - // [numMesoClusters,] - uint32_t* numFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * numMesoClusters); - - // [numMesoClusters + 1,] - uint32_t* csumFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * (numMesoClusters + 1)); - csumFineClusters[0] = 0; - - uint32_t numClustersRemain = desc->numClusters; - uint32_t numTrainsetRemain = numTrainset; - uint32_t mesoClusterSizeSum = 0; // check - uint32_t mesoClusterSizeMax = 0; - uint32_t numFineClustersMax = 0; - for (uint32_t i = 0; i < numMesoClusters; i++) { - if (i < numMesoClusters - 1) { - numFineClusters[i] = - (double)numClustersRemain * mesoClusterSize[i] / numTrainsetRemain + .5; - } else { - numFineClusters[i] = 
numClustersRemain; - } - csumFineClusters[i + 1] = csumFineClusters[i] + numFineClusters[i]; - - numClustersRemain -= numFineClusters[i]; - numTrainsetRemain -= mesoClusterSize[i]; - mesoClusterSizeSum += mesoClusterSize[i]; - mesoClusterSizeMax = max(mesoClusterSizeMax, mesoClusterSize[i]); - numFineClustersMax = max(numFineClustersMax, numFineClusters[i]); - } - RAFT_EXPECTS(mesoClusterSizeSum == numTrainset, "mesocluster sizes do not add up"); - RAFT_EXPECTS(csumFineClusters[numMesoClusters] == desc->numClusters, - "fine cluster sizes do not add up"); - - uint32_t** idsTrainset = - _cuann_multi_device_malloc(1, mesoClusterSizeMax, "idsTrainset"); - - float** subTrainset = - _cuann_multi_device_malloc(1, mesoClusterSizeMax * desc->dimDataset, "subTrainset"); - - // label (cluster ID) of each vector - uint32_t** labelsMP = _cuann_multi_device_malloc(1, mesoClusterSizeMax, "labelsMP"); - - float** clusterCentersEach = _cuann_multi_device_malloc( - 1, numFineClustersMax * desc->dimDataset, "clusterCentersEach"); - - float** clusterCentersMP = _cuann_multi_device_malloc( - 1, numFineClustersMax * desc->dimDataset, "clusterCentersMP"); - - // number of vectors in each cluster - uint32_t** clusterSizeMP = - _cuann_multi_device_malloc(1, numFineClustersMax, "clusterSizeMP"); - - size_t sizePredictWorkspace = 0; - for (uint32_t i = 0; i < numMesoClusters; i++) { - sizePredictWorkspace = - max(sizePredictWorkspace, - _cuann_kmeans_predict_bufferSize(numFineClusters[i], // number of centers - desc->dimDataset, - mesoClusterSize[i] // number of vectors - )); - } - void** predictWorkspace = - (void**)_cuann_multi_device_malloc(1, sizePredictWorkspace, "predictWorkspace"); - - // - // Training kmeans for clusters in each meso-clusters - // -#pragma omp parallel for schedule(dynamic) num_threads(1) - for (uint32_t i = 0; i < numMesoClusters; i++) { - int devId = omp_get_thread_num(); - RAFT_CUDA_TRY(cudaSetDevice(devId)); - - uint32_t k = 0; - for (uint32_t j = 0; j < numTrainset; j++) { - if (mesoClusterLabels[j] != i) continue; - idsTrainset[devId][k++] = j; - } - RAFT_EXPECTS(k == mesoClusterSize[i], "unexpected cluster size for cluster %u", i); - - detail::utils::copy_selected(mesoClusterSize[i], - desc->dimDataset, - trainset, - idsTrainset[devId], - desc->dimDataset, - subTrainset[devId], - desc->dimDataset, - handle.get_stream()); - - int numIterations_2 = numIterations * 2; - for (int iter = 0; iter < numIterations_2; iter += 2) { - _cuann_kmeans_predict(handle, - clusterCentersEach[devId], - numFineClusters[i], - desc->dimDataset, - subTrainset[devId], - CUDA_R_32F, - mesoClusterSize[i], - labelsMP[devId], - desc->metric, - (iter != 0), - predictWorkspace[devId], - clusterCentersMP[devId], - clusterSizeMP[devId], - true); - if ((iter + 1 < numIterations_2) && - detail::kmeans::adjust_centers(clusterCentersEach[devId], - numFineClusters[i], - desc->dimDataset, - subTrainset[devId], - mesoClusterSize[i], - labelsMP[devId], - clusterSizeMP[devId], - (float)1.0 / 4, - device_memory, - handle.get_stream())) { - iter -= 1; - if (desc->metric == distance::DistanceType::InnerProduct) { - detail::utils::normalize_rows( - numFineClusters[i], desc->dimDataset, clusterCentersEach[devId], handle.get_stream()); - } - } - } - RAFT_CUDA_TRY(cudaMemcpy(clusterCenters + (desc->dimDataset * csumFineClusters[i]), - clusterCentersEach[devId], - sizeof(float) * numFineClusters[i] * desc->dimDataset, - cudaMemcpyDeviceToDevice)); - } - for (int devId = 0; devId < 1; devId++) { - RAFT_CUDA_TRY(cudaSetDevice(devId)); 
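The iteration idiom used here (and in the flat and fine-tuning k-means passes further down) deserves spelling out: the counter advances by two per pass so that, whenever adjust_centers reports it reseeded under-populated clusters, pulling the counter back by one grants one extra pass to refine the reseeded centers while the doubled bound still guarantees termination. A condensed, self-contained sketch with hypothetical callables standing in for _cuann_kmeans_predict and detail::kmeans::adjust_centers:

  // One k-means pass per outer step; an extra step is granted after every rebalance.
  template <typename PassFn, typename AdjustFn>
  void train_with_rebalance(int numIterations, PassFn kmeans_pass, AdjustFn adjust_centers)
  {
    for (int iter = 0; iter < 2 * numIterations; iter += 2) {
      kmeans_pass();                                   // assign labels, update centers
      if (iter + 1 < 2 * numIterations && adjust_centers()) {
        iter -= 1;                                     // reseeded: spend one bonus pass
      }
    }
  }

The whole-dataset fine-tuning later in cuannIvfPqBuildIndex uses the same idea with step X = 5 and a pull-back of X - 1, so up to four consecutive rebalanced passes can be absorbed per scheduled iteration.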
- RAFT_CUDA_TRY(cudaDeviceSynchronize()); - } - RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); - - _cuann_multi_device_free(idsTrainset, 1); - _cuann_multi_device_free(subTrainset, 1); - _cuann_multi_device_free(labelsMP, 1); - _cuann_multi_device_free(clusterCentersEach, 1); - _cuann_multi_device_free(clusterCentersMP, 1); - _cuann_multi_device_free(clusterSizeMP, 1); - _cuann_multi_device_free((uint8_t**)predictWorkspace, 1); - - RAFT_CUDA_TRY(cudaFree(mesoClusterSize)); - RAFT_CUDA_TRY(cudaFree(mesoClusterLabels)); - RAFT_CUDA_TRY(cudaFree(mesoClusterCenters)); - RAFT_CUDA_TRY(cudaFree(mesoClusterCentersTemp)); - - free(numFineClusters); - free(csumFineClusters); - - // - // Fine-tuning kmeans for whole clusters (with multipel GPUs) - // - // (*) Since the likely cluster centroids have been calculated - // hierarchically already, the number of iteration for fine-tuning - // kmeans for whole clusters should be reduced. However, there - // is a possibility that the clusters could be unbalanced here, - // in which case the actual number of iterations would be increased. - // - const int X = 5; - int numIterations_X = max(numIterations / 10, 2) * X; - for (int iter = 0; iter < numIterations_X; iter += X) { - _cuann_kmeans_predict_MP(handle, - clusterCenters, - desc->numClusters, - desc->dimDataset, - trainset, - dtype, - numTrainset, - trainsetLabels, - desc->metric, - true, - clusterSize, - true /* to update clusterCenters */); - if ((iter + 1 < numIterations_X) && detail::kmeans::adjust_centers(clusterCenters, - desc->numClusters, - desc->dimDataset, - trainset, - numTrainset, - trainsetLabels, - clusterSize, - (float)1.0 / 5, - device_memory, - handle.get_stream())) { - iter -= (X - 1); - if (desc->metric == distance::DistanceType::InnerProduct) { - detail::utils::normalize_rows( - desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); - } - } - } - } else { - // Flat kmeans - int numIterations_2 = numIterations * 2; - for (int iter = 0; iter < numIterations_2; iter += 2) { - _cuann_kmeans_predict(handle, - clusterCenters, - desc->numClusters, - desc->dimDataset, - trainset, - dtype, - numTrainset, - trainsetLabels, - desc->metric, - (iter != 0), - NULL, - clusterCentersTemp, - clusterSize, - true); - if ((iter + 1 < numIterations_2) && detail::kmeans::adjust_centers(clusterCenters, - desc->numClusters, - desc->dimDataset, - trainset, - numTrainset, - trainsetLabels, - clusterSize, - (float)1.0 / 4, - device_memory, - handle.get_stream())) { - iter -= 1; - if (desc->metric == distance::DistanceType::InnerProduct) { - detail::utils::normalize_rows( - desc->numClusters, desc->dimDataset, clusterCenters, handle.get_stream()); - } - } - } - } - - uint32_t* datasetLabels; // [numDataset] - RAFT_CUDA_TRY(cudaMallocManaged(&datasetLabels, sizeof(uint32_t) * desc->numDataset)); - - // - // Predict labels of whole dataset (with multiple GPUs) - // - _cuann_kmeans_predict_MP(handle, - clusterCenters, - desc->numClusters, - desc->dimDataset, - dataset, - dtype, - desc->numDataset, - datasetLabels, - desc->metric, - true, - clusterSize, - true /* to update clusterCenters */); - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - _cuann_kmeans_show_centers(clusterCenters, desc->numClusters, desc->dimDataset, clusterSize); -#endif - - // Make rotation matrix - RAFT_LOG_DEBUG("# dimDataset: %u\n", desc->dimDataset); - RAFT_LOG_DEBUG("# dimRotDataset: %u\n", desc->dimRotDataset); - RAFT_LOG_DEBUG("# randomRotation: %s\n", randomRotation ? 
"enabled" : "disabled"); - _cuann_make_rotation_matrix( - desc->dimRotDataset, desc->dimDataset, desc->lenPq, randomRotation, rotationMatrix); - - // Rotate clusterCenters - float alpha = 1.0; - float beta = 0.0; - linalg::gemm(handle, - true, - false, - desc->dimRotDataset, - desc->numClusters, - desc->dimDataset, - &alpha, - rotationMatrix, - desc->dimDataset, - clusterCenters, - desc->dimDataset, - &beta, - clusterRotCenters, - desc->dimRotDataset, - handle.get_stream()); - - // - // Make cluster_offsets, originalNumbers and pqDataset - // - uint32_t maxClusterSize = 0; - // cluster_offsets - cluster_offsets[0] = 0; - for (uint32_t l = 0; l < desc->numClusters; l++) { - cluster_offsets[l + 1] = cluster_offsets[l] + clusterSize[l]; - if (maxClusterSize < clusterSize[l]) { maxClusterSize = clusterSize[l]; } - } - RAFT_EXPECTS(cluster_offsets[desc->numClusters] == desc->numDataset, - "Cluster sizes do not add up"); - desc->maxClusterSize = maxClusterSize; - - // originalNumbers - for (uint32_t i = 0; i < desc->numDataset; i++) { - uint32_t l = datasetLabels[i]; - originalNumbers[cluster_offsets[l]] = i; - cluster_offsets[l] += 1; - } - - // Recover cluster_offsets - for (uint32_t l = 0; l < desc->numClusters; l++) { - cluster_offsets[l] -= clusterSize[l]; - } - - // [numDevices][1 << bitPq, lenPq] - float** pqCentersTemp = - _cuann_multi_device_malloc(1, (1 << desc->bitPq) * desc->lenPq, "pqCentersTemp"); - - // [numDevices][1 << bitPq,] - uint32_t** pqClusterSize = - _cuann_multi_device_malloc(1, (1 << desc->bitPq), "pqClusterSize"); - - // Allocate workspace for PQ codebook training - size_t sizePqPredictWorkspace = - _cuann_kmeans_predict_bufferSize((1 << desc->bitPq), desc->lenPq, numTrainset); - sizePqPredictWorkspace = max(sizePqPredictWorkspace, - _cuann_kmeans_predict_bufferSize( - (1 << desc->bitPq), desc->lenPq, maxClusterSize * desc->dimPq)); - void** pqPredictWorkspace = - (void**)_cuann_multi_device_malloc(1, sizePqPredictWorkspace, "pqPredictWorkspace"); - - if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - // - // Training PQ codebook (CUANN_PQ_CENTER_PER_SUBSPACE) - // (*) PQ codebooks are trained for each subspace. 
- // - - // Predict label of trainset again (with multiple GPUs) - _cuann_kmeans_predict_MP(handle, - clusterCenters, - desc->numClusters, - desc->dimDataset, - trainset, - dtype, - numTrainset, - trainsetLabels, - desc->metric, - true, - NULL, - false /* do not update clusterCenters */); - - // [dimPq, numTrainset, lenPq] - size_t sizeModTrainset = sizeof(float) * desc->dimPq * numTrainset * desc->lenPq; - float* modTrainset = (float*)malloc(sizeModTrainset); - memset(modTrainset, 0, sizeModTrainset); - - // modTrainset[] = transpose( rotate(trainset[]) - clusterRotCenters[] ) -#pragma omp parallel for - for (uint32_t i = 0; i < numTrainset; i++) { - uint32_t l = trainsetLabels[i]; - for (uint32_t j = 0; j < desc->dimRotDataset; j++) { - float val; - if (dtype == CUDA_R_32F) { - val = - _cuann_dot(desc->dimDataset, - (float*)trainset + ((uint64_t)(desc->dimDataset) * i), - 1, - rotationMatrix + ((uint64_t)(desc->dimDataset) * j), - 1); - } else if (dtype == CUDA_R_8U) { - float divisor = 256.0; - val = _cuann_dot( - desc->dimDataset, - (uint8_t*)trainset + ((uint64_t)(desc->dimDataset) * i), - 1, - rotationMatrix + ((uint64_t)(desc->dimDataset) * j), - 1, - divisor); - } else if (dtype == CUDA_R_8I) { - float divisor = 128.0; - val = - _cuann_dot(desc->dimDataset, - (int8_t*)trainset + ((uint64_t)(desc->dimDataset) * i), - 1, - rotationMatrix + ((uint64_t)(desc->dimDataset) * j), - 1, - divisor); - } - uint32_t j0 = j / (desc->lenPq); // 0 <= j0 < dimPq - uint32_t j1 = j % (desc->lenPq); // 0 <= j1 < lenPq - uint64_t idx = - j1 + ((uint64_t)(desc->lenPq) * i) + ((uint64_t)(desc->lenPq) * numTrainset * j0); - modTrainset[idx] = val - clusterRotCenters[j + (desc->dimRotDataset * l)]; - } - } - - // [numDevices][numTrainset, lenPq] - float** subTrainset = - _cuann_multi_device_malloc(1, numTrainset * desc->lenPq, "subTrainset"); - - // [numDevices][numTrainset] - uint32_t** subTrainsetLabels = - _cuann_multi_device_malloc(1, numTrainset, "subTrainsetLabels"); - - float** pqCentersEach = - _cuann_multi_device_malloc(1, ((1 << desc->bitPq) * desc->lenPq), "pqCentersEach"); - -#pragma omp parallel for schedule(dynamic) num_threads(1) - for (uint32_t j = 0; j < desc->dimPq; j++) { - int devId = omp_get_thread_num(); - RAFT_CUDA_TRY(cudaSetDevice(devId)); - - float* curPqCenters = pqCenters + ((1 << desc->bitPq) * desc->lenPq) * j; - RAFT_CUDA_TRY(cudaMemcpy(subTrainset[devId], - modTrainset + ((uint64_t)numTrainset * desc->lenPq * j), - sizeof(float) * numTrainset * desc->lenPq, - cudaMemcpyHostToDevice)); - // Train kmeans for each PQ - int numIterations_2 = numIterations * 2; - for (int iter = 0; iter < numIterations_2; iter += 2) { - if (devId == 0) { - fprintf(stderr, - "(%s) Training PQ codebook %u (out of %u): " - "%.1f / %u \r", - __func__, - j, - desc->dimPq, - (float)iter / 2, - numIterations); - } - _cuann_kmeans_predict(handle, - pqCentersEach[devId], - (1 << desc->bitPq), - desc->lenPq, - subTrainset[devId], - CUDA_R_32F, - numTrainset, - subTrainsetLabels[devId], - raft::distance::DistanceType::L2Expanded, - (iter != 0), - pqPredictWorkspace[devId], - pqCentersTemp[devId], - pqClusterSize[devId], - true); - if ((iter + 1 < numIterations_2) && detail::kmeans::adjust_centers(pqCentersEach[devId], - (1 << desc->bitPq), - desc->lenPq, - subTrainset[devId], - numTrainset, - subTrainsetLabels[devId], - pqClusterSize[devId], - (float)1.0 / 4, - device_memory, - handle.get_stream())) { - iter -= 1; - } - } - RAFT_CUDA_TRY(cudaMemcpy(curPqCenters, - pqCentersEach[devId], - sizeof(float) * 
((1 << desc->bitPq) * desc->lenPq), - cudaMemcpyDeviceToDevice)); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - if (j == 0) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - _cuann_kmeans_show_centers( - curPqCenters, (1 << desc->bitPq), desc->lenPq, pqClusterSize[devId]); - } -#endif - } - fprintf(stderr, "\n"); - RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); - - _cuann_multi_device_free(subTrainset, 1); - _cuann_multi_device_free(subTrainsetLabels, 1); - _cuann_multi_device_free(pqCentersEach, 1); - free(modTrainset); - } - - // - // Compute PQ code for whole dataset - // - _cuann_compute_PQ_code(handle, - desc->numDataset, - desc->dimDataset, - desc->dimRotDataset, - desc->dimPq, - desc->lenPq, - desc->bitPq, - desc->numClusters, - desc->typePqCenter, - maxClusterSize, - clusterCenters, - rotationMatrix, - dataset, - originalNumbers, - clusterSize, - cluster_offsets, - pqCenters, - numIterations, - pqDataset); - RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); - - // - _cuann_get_inclusiveSumSortedClusterSize( - desc, cluster_offsets, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); - _cuann_get_sqsumClusters(desc, clusterCenters, &(desc->sqsumClusters)); - - { - // combine clusterCenters and sqsumClusters - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - float* tmpClusterCenters; // [numClusters, dimDataset] - RAFT_CUDA_TRY( - cudaMallocManaged(&tmpClusterCenters, sizeof(float) * desc->numClusters * desc->dimDataset)); - for (uint32_t i = 0; i < desc->numClusters * desc->dimDataset; i++) { - tmpClusterCenters[i] = clusterCenters[i]; - } - for (uint32_t i = 0; i < desc->numClusters; i++) { - for (uint32_t j = 0; j < desc->dimDataset; j++) { - clusterCenters[j + (desc->dimDatasetExt * i)] = - tmpClusterCenters[j + (desc->dimDataset * i)]; - } - clusterCenters[desc->dimDataset + (desc->dimDatasetExt * i)] = desc->sqsumClusters[i]; - } - RAFT_CUDA_TRY(cudaFree(tmpClusterCenters)); - } - - // - cuannIvfPqGetIndexSize(desc, &(header->indexSize)); - header->metric = desc->metric; - header->numClusters = desc->numClusters; - header->numDataset = desc->numDataset; - header->dimDataset = desc->dimDataset; - header->dimPq = desc->dimPq; - header->maxClusterSize = maxClusterSize; - header->dimRotDataset = desc->dimRotDataset; - header->bitPq = desc->bitPq; - header->typePqCenter = desc->typePqCenter; - header->dtypeDataset = desc->dtypeDataset; - header->dimDatasetExt = desc->dimDatasetExt; - header->numDatasetAdded = 0; - - // - RAFT_CUDA_TRY(cudaFree(clusterSize)); - RAFT_CUDA_TRY(cudaFree(trainsetLabels)); - RAFT_CUDA_TRY(cudaFree(datasetLabels)); - RAFT_CUDA_TRY(cudaFree(clusterCentersTemp)); - - _cuann_multi_device_free(wsKAC, 1); - _cuann_multi_device_free(pqCentersTemp, 1); - _cuann_multi_device_free(pqClusterSize, 1); - _cuann_multi_device_free((uint8_t**)pqPredictWorkspace, 1); - - _cuann_set_device(callerDevId); -} - -// cuannIvfPqSaveIndex -inline void cuannIvfPqSaveIndex(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - const char* fileName) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - int orgDevId = _cuann_set_device(handle.get_device()); - - FILE* fp = fopen(fileName, "w"); - RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); - - struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); - RAFT_LOG_DEBUG("indexSize: %lu\n", header->indexSize); - if (fwrite(desc->index_ptr, 1, header->indexSize, fp) != header->indexSize) { - RAFT_FAIL("(%s) failed to save index to file (%s)\n", __func__, 
fileName); - } - fclose(fp); - - _cuann_set_device(orgDevId); -} - -// cuannIvfPqLoadIndex -inline void cuannIvfPqLoadIndex(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - const char* fileName) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - int orgDevId = _cuann_set_device(handle.get_device()); - - if (1 /* *index == NULL */) { - FILE* fp = fopen(fileName, "r"); - RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); - - if (desc->index_ptr != NULL) { RAFT_CUDA_TRY(cudaFree(desc->index_ptr)); } - size_t indexSize; - fread(&indexSize, sizeof(size_t), 1, fp); - RAFT_LOG_DEBUG("indexSize: %lu\n", indexSize); - RAFT_CUDA_TRY(cudaMallocManaged(&(desc->index_ptr), indexSize)); - fseek(fp, 0, SEEK_SET); - if (fread(desc->index_ptr, 1, indexSize, fp) != indexSize) { - RAFT_FAIL("(%s) failed to load index to from file (%s)\n", __func__, fileName); - } - fclose(fp); - - RAFT_CUDA_TRY( - cudaMemAdvise(desc->index_ptr, indexSize, cudaMemAdviseSetReadMostly, handle.get_device())); - } - - struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); - desc->numClusters = header->numClusters; - desc->numDataset = header->numDataset; - desc->dimDataset = header->dimDataset; - desc->dimPq = header->dimPq; - desc->metric = (distance::DistanceType)header->metric; - desc->maxClusterSize = header->maxClusterSize; - desc->dimRotDataset = header->dimRotDataset; - desc->lenPq = desc->dimRotDataset / desc->dimPq; - desc->bitPq = header->bitPq; - desc->typePqCenter = (cuannPqCenter_t)header->typePqCenter; - desc->dtypeDataset = (cudaDataType_t)header->dtypeDataset; - desc->dimDatasetExt = header->dimDatasetExt; - desc->indexVersion = header->version; - - float* clusterCenters; // [numClusters, dimDatasetExt] - float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] - uint32_t* originalNumbers; // [numDataset] - uint32_t* cluster_offsets; // [numClusters + 1] - float* rotationMatrix; // [dimDataset, dimRotDataset] - float* clusterRotCenters; // [numClusters, dimRotDataset] - _cuann_get_index_pointers(desc, - &header, - &clusterCenters, - &pqCenters, - &pqDataset, - &originalNumbers, - &cluster_offsets, - &rotationMatrix, - &clusterRotCenters); - - // - _cuann_get_inclusiveSumSortedClusterSize( - desc, cluster_offsets, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); - - size_t size; - // pqDataset - size = sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8; - if (size < handle.get_device_properties().totalGlobalMem) { - RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqDataset, size, handle.get_device())); - } - // clusterCenters - size = sizeof(float) * desc->numClusters * desc->dimDatasetExt; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterCenters, size, handle.get_device())); - // pqCenters - if (desc->typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - size = sizeof(float) * desc->dimPq * (1 << desc->bitPq) * desc->lenPq; - } else { - size = sizeof(float) * desc->numClusters * (1 << desc->bitPq) * desc->lenPq; - } - RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqCenters, size, handle.get_device())); - // originalNumbers - size = sizeof(uint32_t) * desc->numDataset; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(originalNumbers, size, handle.get_device())); - // cluster_offsets - size = sizeof(uint32_t) * (desc->numClusters + 1); - RAFT_CUDA_TRY(cudaMemPrefetchAsync(cluster_offsets, size, handle.get_device())); - // rotationMatrix - if 
(rotationMatrix != NULL) { - size = sizeof(float) * desc->dimDataset * desc->dimRotDataset; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(rotationMatrix, size, handle.get_device())); - } - // clusterRotCenters - if (clusterRotCenters != NULL) { - size = sizeof(float) * desc->numClusters * desc->dimRotDataset; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterRotCenters, size, handle.get_device())); - } - - _cuann_set_device(orgDevId); -} - -template -auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( - const handle_t& handle, - cuannIvfPqDescriptor_t& oldDesc, - const T* newVectors, /* [numNewVectors, dimDataset] */ - uint32_t numNewVectors) -> cuannIvfPqDescriptor_t -{ - switch (detail::utils::check_pointer_residency(newVectors)) { - case detail::utils::pointer_residency::host_only: - case detail::utils::pointer_residency::host_and_device: break; - default: RAFT_FAIL("newVectors must be accessible from the host."); - } - int cuannDevId = handle.get_device(); - int callerDevId = _cuann_set_device(cuannDevId); - - cudaDataType_t dtype = oldDesc->dtypeDataset; - if constexpr (std::is_same_v) { - RAFT_EXPECTS( - dtype == CUDA_R_32F, - "The old index type (%d) doesn't much CUDA_R_32F required by the template instantiation", - dtype); - } else if constexpr (std::is_same_v) { - RAFT_EXPECTS( - dtype == CUDA_R_8U, - "The old index type (%d) doesn't much CUDA_R_8U required by the template instantiation", - dtype); - } else if constexpr (std::is_same_v) { - RAFT_EXPECTS( - dtype == CUDA_R_8I, - "The old index type (%d) doesn't much CUDA_R_8I required by the template instantiation", - dtype); - } else { - static_assert( - std::is_same_v || std::is_same_v || std::is_same_v, - "unsupported type"); - } - - char dtypeString[64]; - _cuann_get_dtype_string(dtype, dtypeString); - RAFT_LOG_DEBUG("dtype: %s", dtypeString); - RAFT_LOG_DEBUG("dimDataset: %u", oldDesc->dimDataset); - struct cuannIvfPqIndexHeader* oldHeader; - float* oldClusterCenters; // [numClusters, dimDatasetExt] - float* oldPqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t* oldPqDataset; // [numDataset, dimPq * bitPq / 8] - uint32_t* oldOriginalNumbers; // [numDataset] - uint32_t* old_cluster_offsets; // [numClusters + 1] - float* oldRotationMatrix; // [dimDataset, dimRotDataset] - float* oldClusterRotCenters; // [numClusters, dimRotDataset] - _cuann_get_index_pointers(oldDesc, - &oldHeader, - &oldClusterCenters, - &oldPqCenters, - &oldPqDataset, - &oldOriginalNumbers, - &old_cluster_offsets, - &oldRotationMatrix, - &oldClusterRotCenters); - - // - // The clusterCenters stored in index contain data other than cluster - // centroids to speed up the search. Here, only the cluster centroids - // are extracted. - // - float* clusterCenters; // [numClusters, dimDataset] - RAFT_CUDA_TRY( - cudaMallocManaged(&clusterCenters, sizeof(float) * oldDesc->numClusters * oldDesc->dimDataset)); - for (uint32_t i = 0; i < oldDesc->numClusters; i++) { - memcpy(clusterCenters + (uint64_t)i * oldDesc->dimDataset, - oldClusterCenters + (uint64_t)i * oldDesc->dimDatasetExt, - sizeof(float) * oldDesc->dimDataset); - } - - // - // Use the existing cluster centroids to find the label (cluster ID) - // of the vector to be added. 
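A note on the strided copy just above: inside a stored index each cluster-center row is padded to dimDatasetExt, with column dimDataset holding the center's squared norm for the search-side GEMM, so only the leading dimDataset columns are extracted here into a dense [numClusters, dimDataset] buffer before the rows are handed to the k-means predictor below.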
- // - uint32_t* newVectorLabels; // [numNewVectors,] - RAFT_CUDA_TRY(cudaMallocManaged(&newVectorLabels, sizeof(uint32_t) * numNewVectors)); - RAFT_CUDA_TRY(cudaMemset(newVectorLabels, 0, sizeof(uint32_t) * numNewVectors)); - uint32_t* clusterSize; // [numClusters,] - RAFT_CUDA_TRY(cudaMallocManaged(&clusterSize, sizeof(uint32_t) * oldDesc->numClusters)); - RAFT_CUDA_TRY(cudaMemset(clusterSize, 0, sizeof(uint32_t) * oldDesc->numClusters)); - _cuann_kmeans_predict_MP(handle, - clusterCenters, - oldDesc->numClusters, - oldDesc->dimDataset, - newVectors, - dtype, - numNewVectors, - newVectorLabels, - oldDesc->metric, - true, - clusterSize, - false /* do not update clusterCenters */); - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - { - const int _num_show = 10; - fprintf(stderr, "# numNewVectors: %u\n", numNewVectors); - fprintf(stderr, "# newVectorLabels: "); - for (uint32_t i = 0; i < numNewVectors; i++) { - if ((i < _num_show) || (numNewVectors - i <= _num_show)) { - fprintf(stderr, "%u, ", newVectorLabels[i]); - } else if (i == _num_show) { - fprintf(stderr, "..., "); - } - } - fprintf(stderr, "\n"); - } - { - const int _num_show = 10; - fprintf(stderr, "# oldDesc->numClusters: %u\n", oldDesc->numClusters); - fprintf(stderr, "# clusterSize: "); - int _sum = 0; - for (uint32_t i = 0; i < oldDesc->numClusters; i++) { - _sum += clusterSize[i]; - if ((i < _num_show) || (oldDesc->numClusters - i <= _num_show)) { - fprintf(stderr, "%u, ", clusterSize[i]); - } else if (i == _num_show) { - fprintf(stderr, "..., "); - } - } - fprintf(stderr, "\n"); - fprintf(stderr, "# _sum: %d\n", _sum); - } -#endif - - // - // Make cluster_offsets, originalNumbers - // - uint32_t maxClusterSize = 0; - uint32_t* cluster_offsets; // [numClusters + 1] - uint32_t* originalNumbers; // [numNewVectors] - cluster_offsets = (uint32_t*)malloc(sizeof(uint32_t) * (oldDesc->numClusters + 1)); - originalNumbers = (uint32_t*)malloc(sizeof(uint32_t) * numNewVectors); - // cluster_offsets - cluster_offsets[0] = 0; - for (uint32_t l = 0; l < oldDesc->numClusters; l++) { - cluster_offsets[l + 1] = cluster_offsets[l] + clusterSize[l]; - maxClusterSize = max(maxClusterSize, clusterSize[l]); - } - RAFT_EXPECTS(cluster_offsets[oldDesc->numClusters] == numNewVectors, - "cluster sizes do not add up."); - // originalNumbers - for (uint32_t i = 0; i < numNewVectors; i++) { - uint32_t l = newVectorLabels[i]; - originalNumbers[cluster_offsets[l]] = i; - cluster_offsets[l] += 1; - } - // Recover cluster_offsets - for (uint32_t l = 0; l < oldDesc->numClusters; l++) { - cluster_offsets[l] -= clusterSize[l]; - } - - // - // Compute PQ code for new vectors - // - uint8_t* pqDataset; // [numNewVectors, dimPq * bitPq / 8] - RAFT_CUDA_TRY(cudaMallocManaged( - &pqDataset, sizeof(uint8_t) * numNewVectors * oldDesc->dimPq * oldDesc->bitPq / 8)); - _cuann_compute_PQ_code(handle, - numNewVectors, - oldDesc->dimDataset, - oldDesc->dimRotDataset, - oldDesc->dimPq, - oldDesc->lenPq, - oldDesc->bitPq, - oldDesc->numClusters, - oldDesc->typePqCenter, - maxClusterSize, - clusterCenters, - oldRotationMatrix, - newVectors, - originalNumbers, - clusterSize, - cluster_offsets, - oldPqCenters, - 0, - pqDataset); - RAFT_CUDA_TRY(cudaSetDevice(cuannDevId)); - - // - // Create descriptor for new index - // - auto newDesc = cuannIvfPqCreateDescriptor(); - memcpy(newDesc.get(), oldDesc.get(), sizeof(struct cuannIvfPqDescriptor)); - newDesc->numDataset += numNewVectors; - newDesc->inclusiveSumSortedClusterSize = nullptr; - newDesc->sqsumClusters = nullptr; - 
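The cluster_offsets / originalNumbers construction above (and its twin in cuannIvfPqBuildIndex) is an ordinary counting sort: prefix-sum the per-cluster counts, scatter each vector id to its cluster's running cursor, then subtract the counts again to restore the offsets. A standalone sketch of the idiom, with hypothetical names:

  #include <cstdint>
  #include <vector>

  // Group vector ids by cluster label; counts[l] must equal the number of ids with label l.
  inline void group_by_label(const std::vector<uint32_t>& labels,
                             const std::vector<uint32_t>& counts,
                             std::vector<uint32_t>& offsets,  // size counts.size() + 1
                             std::vector<uint32_t>& ids)      // size labels.size()
  {
    const uint32_t numClusters = static_cast<uint32_t>(counts.size());
    const uint32_t numVectors  = static_cast<uint32_t>(labels.size());
    offsets[0] = 0;
    for (uint32_t l = 0; l < numClusters; l++) { offsets[l + 1] = offsets[l] + counts[l]; }
    for (uint32_t i = 0; i < numVectors; i++) { ids[offsets[labels[i]]++] = i; }  // scatter
    for (uint32_t l = 0; l < numClusters; l++) { offsets[l] -= counts[l]; }       // restore
  }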
newDesc->index_ptr = nullptr; - RAFT_LOG_DEBUG("numDataset: %u -> %u", oldDesc->numDataset, newDesc->numDataset); - - // - // Allocate memory for new index - // - size_t newIndexSize; - cuannIvfPqGetIndexSize(newDesc, &newIndexSize); - RAFT_LOG_DEBUG("indexSize: %lu -> %lu", oldHeader->indexSize, newIndexSize); - RAFT_CUDA_TRY(cudaMallocManaged(&(newDesc->index_ptr), newIndexSize)); - memset(newDesc->index_ptr, 0, newIndexSize); - struct cuannIvfPqIndexHeader* newHeader; - float* newClusterCenters; // [numClusters, dimDatasetExt] - float* newPqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t* newPqDataset; // [numDataset, dimPq * bitPq / 8] *** - uint32_t* newOriginalNumbers; // [numDataset] *** - uint32_t* new_cluster_offsets; // [numClusters + 1] *** - float* newRotationMatrix; // [dimDataset, dimRotDataset] - float* newClusterRotCenters; // [numClusters, dimRotDataset] - _cuann_get_index_pointers(newDesc, - &newHeader, - &newClusterCenters, - &newPqCenters, - &newPqDataset, - &newOriginalNumbers, - &new_cluster_offsets, - &newRotationMatrix, - &newClusterRotCenters); - - // - // Copy the unchanged parts - // header, clusterCenters, pqCenters, rotationMatrix, clusterRotCenters - // - memcpy(newHeader, oldHeader, sizeof(struct cuannIvfPqIndexHeader)); - { - cuannIvfPqGetIndexSize(newDesc, &(newHeader->indexSize)); - newHeader->numDataset = newDesc->numDataset; - newHeader->numDatasetAdded += numNewVectors; - } - memcpy(newClusterCenters, oldClusterCenters, _cuann_getIndexSize_clusterCenters(oldDesc)); - memcpy(newPqCenters, oldPqCenters, _cuann_getIndexSize_pqCenters(oldDesc)); - memcpy(newRotationMatrix, oldRotationMatrix, _cuann_getIndexSize_rotationMatrix(oldDesc)); - memcpy( - newClusterRotCenters, oldClusterRotCenters, _cuann_getIndexSize_clusterRotCenters(oldDesc)); - - // - // Make new_cluster_offsets - // - maxClusterSize = 0; - new_cluster_offsets[0] = 0; - for (uint32_t l = 0; l < newDesc->numClusters; l++) { - uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; - new_cluster_offsets[l + 1] = new_cluster_offsets[l]; - new_cluster_offsets[l + 1] += oldClusterSize + clusterSize[l]; - maxClusterSize = max(maxClusterSize, oldClusterSize + clusterSize[l]); - } - { - newDesc->maxClusterSize = maxClusterSize; - newHeader->maxClusterSize = maxClusterSize; - } - RAFT_LOG_DEBUG("maxClusterSize: %u -> %u", oldDesc->maxClusterSize, newDesc->maxClusterSize); - - // - // Make newOriginalNumbers - // - for (uint32_t i = 0; i < numNewVectors; i++) { - originalNumbers[i] += oldDesc->numDataset; - } - for (uint32_t l = 0; l < newDesc->numClusters; l++) { - uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; - memcpy(newOriginalNumbers + new_cluster_offsets[l], - oldOriginalNumbers + old_cluster_offsets[l], - sizeof(uint32_t) * oldClusterSize); - memcpy(newOriginalNumbers + new_cluster_offsets[l] + oldClusterSize, - originalNumbers + cluster_offsets[l], - sizeof(uint32_t) * clusterSize[l]); - } - - // - // Make newPqDataset - // - size_t unitPqDataset = newDesc->dimPq * newDesc->bitPq / 8; - for (uint32_t l = 0; l < newDesc->numClusters; l++) { - uint32_t oldClusterSize = old_cluster_offsets[l + 1] - old_cluster_offsets[l]; - memcpy(newPqDataset + unitPqDataset * new_cluster_offsets[l], - oldPqDataset + unitPqDataset * old_cluster_offsets[l], - sizeof(uint8_t) * unitPqDataset * oldClusterSize); - memcpy(newPqDataset + unitPqDataset * (new_cluster_offsets[l] + oldClusterSize), - pqDataset + unitPqDataset 
* cluster_offsets[l], - sizeof(uint8_t) * unitPqDataset * clusterSize[l]); - } - - _cuann_get_inclusiveSumSortedClusterSize( - newDesc, new_cluster_offsets, newClusterCenters, &(newDesc->inclusiveSumSortedClusterSize)); - - // - // Done - // - if (newHeader->numDatasetAdded * 2 > newHeader->numDataset) { - RAFT_LOG_INFO( - "The total number of vectors in the new index" - " is now more than twice the initial number of vectors." - " You may want to re-build the index from scratch." - " (numVectors: %u, numVectorsAdded: %u)", - newHeader->numDataset, - newHeader->numDatasetAdded); - } - - free(originalNumbers); - free(cluster_offsets); - - RAFT_CUDA_TRY(cudaFree(pqDataset)); - RAFT_CUDA_TRY(cudaFree(clusterSize)); - RAFT_CUDA_TRY(cudaFree(newVectorLabels)); - RAFT_CUDA_TRY(cudaFree(clusterCenters)); - - _cuann_set_device(callerDevId); - return newDesc; -} - -// cuannIvfPqSetSearchParameters -inline void cuannIvfPqSetSearchParameters(cuannIvfPqDescriptor_t& desc, - const uint32_t numProbes, - const uint32_t topK) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - RAFT_EXPECTS(numProbes > 0, "numProbes must be larger than zero"); - RAFT_EXPECTS(topK > 0, "topK must be larger than zero"); - RAFT_EXPECTS(numProbes <= desc->numClusters, - "numProbes (%u) must be not larger than numClusters (%u)", - numProbes, - desc->numClusters); - RAFT_EXPECTS(topK <= desc->numDataset, - "topK (%u) must be not larger than numDataset (%u)", - numProbes, - desc->numDataset); - - uint32_t numSamplesWorstCase = desc->numDataset; - if (numProbes < desc->numClusters) { - numSamplesWorstCase = - desc->numDataset - - desc->inclusiveSumSortedClusterSize[desc->numClusters - 1 - numProbes - - desc->_numClustersSize0]; // (*) urgent WA, need to be - // fixed. 
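To unpack the check that follows (illustrative numbers only): inclusiveSumSortedClusterSize holds the running sum of cluster sizes sorted in descending order (as its use for maxSamples below implies), so maxSamples = inclusiveSumSortedClusterSize[numProbes - 1] is the largest number of points numProbes probed clusters can contain, while numSamplesWorstCase subtracts everything except the numProbes smallest clusters and is therefore the fewest points a query can possibly see. For example, with cluster sizes {50, 30, 12, 5, 3} and numProbes = 2: maxSamples = 50 + 30 = 80 and numSamplesWorstCase = 5 + 3 = 8, so any topK > 8 is rejected as unreliable.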
- } - RAFT_EXPECTS(topK <= numSamplesWorstCase, - "numProbes is too small to get topK results reliably (numProbes: %u, topK: %u, " - "numSamplesWorstCase: %u).", - numProbes, - topK, - numSamplesWorstCase); - desc->numProbes = numProbes; - desc->topK = topK; - desc->maxSamples = desc->inclusiveSumSortedClusterSize[numProbes - 1]; - if (desc->maxSamples % 128) { desc->maxSamples += 128 - (desc->maxSamples % 128); } - desc->internalDistanceDtype = CUDA_R_32F; - desc->smemLutDtype = CUDA_R_32F; - desc->preferredThreadBlockSize = 0; -} - -// cuannIvfPqSetSearchParameters -inline void cuannIvfPqSetSearchTuningParameters(cuannIvfPqDescriptor_t& desc, - cudaDataType_t internalDistanceDtype, - cudaDataType_t smemLutDtype, - const uint32_t preferredThreadBlockSize) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - RAFT_EXPECTS(internalDistanceDtype == CUDA_R_16F || internalDistanceDtype == CUDA_R_32F, - "internalDistanceDtype must be either CUDA_R_16F or CUDA_R_32F"); - RAFT_EXPECTS( - smemLutDtype == CUDA_R_16F || smemLutDtype == CUDA_R_32F || smemLutDtype == CUDA_R_8U, - "smemLutDtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U"); - RAFT_EXPECTS(preferredThreadBlockSize == 256 || preferredThreadBlockSize == 512 || - preferredThreadBlockSize == 1024 || preferredThreadBlockSize == 0, - "preferredThreadBlockSize must be 0, 256, 512 or 1024, but %u is given.", - preferredThreadBlockSize); - desc->internalDistanceDtype = internalDistanceDtype; - desc->smemLutDtype = smemLutDtype; - desc->preferredThreadBlockSize = preferredThreadBlockSize; -} - -// cuannIvfPqGetSearchParameters -inline void cuannIvfPqGetSearchParameters(cuannIvfPqDescriptor_t& desc, - uint32_t* numProbes, - uint32_t* topK) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - *numProbes = desc->numProbes; - *topK = desc->topK; -} - -// cuannIvfPqGetSearchTuningParameters -inline void cuannIvfPqGetSearchTuningParameters(cuannIvfPqDescriptor_t& desc, - cudaDataType_t* internalDistanceDtype, - cudaDataType_t* smemLutDtype, - uint32_t* preferredThreadBlockSize) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - *internalDistanceDtype = desc->internalDistanceDtype; - *smemLutDtype = desc->smemLutDtype; - *preferredThreadBlockSize = desc->preferredThreadBlockSize; -} - -// cuannIvfPqSearch -inline void cuannIvfPqSearch_bufferSize(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - uint32_t maxQueries, - size_t maxWorkspaceSize, - size_t* workspaceSize) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - - size_t max_ws = maxWorkspaceSize; - if (max_ws == 0) { - max_ws = (size_t)1 * 1024 * 1024 * 1024; // default, 1GB - } else { - max_ws = max(max_ws, (size_t)512 * 1024 * 1024); - } - - size_t size_0 = - Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries - Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries - Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries - Pow2<128>::roundUp(sizeof(uint32_t) * maxQueries * desc->numProbes) + // clusterLabels.. 
- Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->numClusters) + // QCDistances - _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); - if (size_0 > max_ws) { - maxQueries = maxQueries * max_ws / size_0; - if (maxQueries > 32) { maxQueries -= (maxQueries % 32); } - } - // maxQueries = min(max(maxQueries, 1), 1024); - // maxQueries = min(max(maxQueries, 1), 2048); - maxQueries = min(max(maxQueries, 1), 4096); - desc->maxQueries = maxQueries; - - *workspaceSize = - Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // devQueries - Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimDatasetExt) + // curQueries - Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->dimRotDataset) + // rotQueries - Pow2<128>::roundUp(sizeof(uint32_t) * maxQueries * desc->numProbes); // clusterLabels.. - - max_ws -= *workspaceSize; - desc->maxBatchSize = 1; - while (1) { - uint32_t nextBatchSize = desc->maxBatchSize * max_ws / ivfpq_search_bufferSize(handle, desc); - if (desc->maxBatchSize >= nextBatchSize) break; - desc->maxBatchSize = nextBatchSize; - } - desc->maxBatchSize = min(max(desc->maxBatchSize, 1), maxQueries); - - if (maxQueries > desc->maxBatchSize) { - // Adjust maxBatchSize to reduce workspace size. - uint32_t num = (maxQueries + desc->maxBatchSize - 1) / desc->maxBatchSize; - if (1 < num && num < 5) { desc->maxBatchSize = (maxQueries + num - 1) / num; } - } - - if (1) { - // Adjust maxBatchSize to improve GPU occupancy of topk kernel. - uint32_t numCta_total = getMultiProcessorCount() * 2; - uint32_t numCta_perBatch = numCta_total / desc->maxBatchSize; - float utilization = (float)numCta_perBatch * desc->maxBatchSize / numCta_total; - if (numCta_perBatch > 1 || (numCta_perBatch == 1 && utilization < 0.6)) { - uint32_t numCta_perBatch_1 = numCta_perBatch + 1; - uint32_t maxBatchSize_1 = numCta_total / numCta_perBatch_1; - float utilization_1 = (float)numCta_perBatch_1 * maxBatchSize_1 / numCta_total; - if (utilization < utilization_1) { desc->maxBatchSize = maxBatchSize_1; } - } - } - - size_t size_1 = - Pow2<128>::roundUp(sizeof(float) * maxQueries * desc->numClusters) + // QCDistance - _cuann_find_topk_bufferSize(handle, desc->numProbes, maxQueries, desc->numClusters); - size_t size_2 = ivfpq_search_bufferSize(handle, desc); - *workspaceSize += max(size_1, size_2); - - RAFT_LOG_TRACE("maxQueries: %u", maxQueries); - RAFT_LOG_TRACE("maxBatchSize: %u", desc->maxBatchSize); - RAFT_LOG_DEBUG( - "workspaceSize: %lu (%.3f GiB)", *workspaceSize, (float)*workspaceSize / 1024 / 1024 / 1024); -} - -template -void cuannIvfPqSearch(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - const T* queries, /* [numQueries, dimDataset], host or device pointer */ - uint32_t numQueries, - uint64_t* neighbors, /* [numQueries, topK], device pointer */ - float* distances, /* [numQueries, topK], device pointer */ - void* workspace) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - int orgDevId = _cuann_set_device(handle.get_device()); - - cudaDataType_t dtype; - if constexpr (std::is_same_v) { - dtype = CUDA_R_32F; - } else if constexpr (std::is_same_v) { - dtype = CUDA_R_8U; - } else if constexpr (std::is_same_v) { - dtype = CUDA_R_8I; - } else { - static_assert( - std::is_same_v || std::is_same_v || std::is_same_v, - "unsupported type"); - } - - struct cuannIvfPqIndexHeader* header; - float* clusterCenters; // [numClusters, dimDatasetExt] - float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, 
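A worked illustration of the batch-size/occupancy adjustment above (hypothetical figures): on a GPU with 108 SMs, numCta_total = 216; if maxBatchSize had come out as 100, then numCta_perBatch = 216 / 100 = 2 and utilization = 2 * 100 / 216 ≈ 0.93, so the code tries numCta_perBatch + 1 = 3, which gives maxBatchSize_1 = 216 / 3 = 72 and utilization_1 = 3 * 72 / 216 = 1.0; since that is higher, maxBatchSize is lowered to 72 so the top-k kernel's thread blocks tile the device evenly.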
lenPq] - uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] - uint32_t* originalNumbers; // [numDataset] - uint32_t* cluster_offsets; // [numClusters + 1] - float* rotationMatrix; // [dimDataset, dimRotDataset] - float* clusterRotCenters; // [numClusters, dimRotDataset] - _cuann_get_index_pointers(desc, - &header, - &clusterCenters, - &pqCenters, - &pqDataset, - &originalNumbers, - &cluster_offsets, - &rotationMatrix, - &clusterRotCenters); - // - void* devQueries; // [maxQueries, dimDatasetExt] - float* curQueries; // [maxQueries, dimDatasetExt] - float* rotQueries; // [maxQueries, dimRotDataset] - uint32_t* clusterLabelsToProbe; // [maxQueries, numProbes] - float* QCDistances; // [maxQueries, numClusters] - void* topkWorkspace; - void* searchWorkspace; - devQueries = (void*)workspace; - curQueries = (float*)((uint8_t*)devQueries + - Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); - rotQueries = (float*)((uint8_t*)curQueries + - Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimDatasetExt)); - clusterLabelsToProbe = - (uint32_t*)((uint8_t*)rotQueries + - Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->dimRotDataset)); - // - QCDistances = (float*)((uint8_t*)clusterLabelsToProbe + - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); - topkWorkspace = (void*)((uint8_t*)QCDistances + - Pow2<128>::roundUp(sizeof(float) * desc->maxQueries * desc->numClusters)); - // - searchWorkspace = - (void*)((uint8_t*)clusterLabelsToProbe + - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxQueries * desc->numProbes)); - - void (*_ivfpq_search)(const handle_t&, - cuannIvfPqDescriptor_t&, - uint32_t, - const float*, - const float*, - const uint8_t*, - const uint32_t*, - const uint32_t*, - const uint32_t*, - const float*, - uint64_t*, - float*, - void*); - if (desc->internalDistanceDtype == CUDA_R_16F) { - if (desc->smemLutDtype == CUDA_R_16F) { - _ivfpq_search = ivfpq_search; - } else if (desc->smemLutDtype == CUDA_R_8U) { - _ivfpq_search = ivfpq_search>; - } else { - _ivfpq_search = ivfpq_search; - } - } else { - if (desc->smemLutDtype == CUDA_R_16F) { - _ivfpq_search = ivfpq_search; - } else if (desc->smemLutDtype == CUDA_R_8U) { - _ivfpq_search = ivfpq_search>; - } else { - _ivfpq_search = ivfpq_search; - } - } - - switch (detail::utils::check_pointer_residency(neighbors, distances)) { - case detail::utils::pointer_residency::device_only: - case detail::utils::pointer_residency::host_and_device: break; - default: RAFT_FAIL("output pointers must be accessible from the device."); - } - - cudaPointerAttributes attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, queries)); - - for (uint32_t i = 0; i < numQueries; i += desc->maxQueries) { - uint32_t nQueries = min(desc->maxQueries, numQueries - i); - - float fillValue = 0.0; - if (desc->metric != raft::distance::DistanceType::InnerProduct) { fillValue = 1.0 / -2.0; } - float divisor = 1.0; - if (desc->dtypeDataset == CUDA_R_8U) { - divisor = 256.0; - } else if (desc->dtypeDataset == CUDA_R_8I) { - divisor = 128.0; - } - if (dtype == CUDA_R_32F) { - float* ptrQueries = (float*)queries + ((uint64_t)(desc->dimDataset) * i); - if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, - ptrQueries, - sizeof(float) * nQueries * desc->dimDataset, - cudaMemcpyHostToDevice, - handle.get_stream())); - ptrQueries = (float*)devQueries; - } - _cuann_copy_fill(nQueries, - desc->dimDataset, - ptrQueries, - desc->dimDataset, - 
curQueries, - desc->dimDatasetExt, - fillValue, - divisor, - handle.get_stream()); - } else if (dtype == CUDA_R_8U) { - uint8_t* ptrQueries = (uint8_t*)queries + ((uint64_t)(desc->dimDataset) * i); - if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, - ptrQueries, - sizeof(uint8_t) * nQueries * desc->dimDataset, - cudaMemcpyHostToDevice, - handle.get_stream())); - ptrQueries = (uint8_t*)devQueries; - } - _cuann_copy_fill(nQueries, - desc->dimDataset, - ptrQueries, - desc->dimDataset, - curQueries, - desc->dimDatasetExt, - fillValue, - divisor, - handle.get_stream()); - } else if (dtype == CUDA_R_8I) { - int8_t* ptrQueries = (int8_t*)queries + ((uint64_t)(desc->dimDataset) * i); - if (attr.type != cudaMemoryTypeDevice && attr.type != cudaMemoryTypeManaged) { - RAFT_CUDA_TRY(cudaMemcpyAsync(devQueries, - ptrQueries, - sizeof(int8_t) * nQueries * desc->dimDataset, - cudaMemcpyHostToDevice, - handle.get_stream())); - ptrQueries = (int8_t*)devQueries; - } - _cuann_copy_fill(nQueries, - desc->dimDataset, - ptrQueries, - desc->dimDataset, - curQueries, - desc->dimDatasetExt, - fillValue, - divisor, - handle.get_stream()); - } - - float alpha; - float beta; - uint32_t gemmK = desc->dimDataset; - if (desc->metric == distance::DistanceType::InnerProduct) { - alpha = -1.0; - beta = 0.0; - } else { - alpha = -2.0; - beta = 0.0; - gemmK = desc->dimDataset + 1; - RAFT_EXPECTS(gemmK <= desc->dimDatasetExt, "unexpected gemmK or dimDatasetExt"); - } - linalg::gemm(handle, - true, - false, - desc->numClusters, - nQueries, - gemmK, - &alpha, - clusterCenters, - desc->dimDatasetExt, - curQueries, - desc->dimDatasetExt, - &beta, - QCDistances, - desc->numClusters, - handle.get_stream()); - - // Rotate queries - alpha = 1.0; - beta = 0.0; - linalg::gemm(handle, - true, - false, - desc->dimRotDataset, - nQueries, - desc->dimDataset, - &alpha, - rotationMatrix, - desc->dimDataset, - curQueries, - desc->dimDatasetExt, - &beta, - rotQueries, - desc->dimRotDataset, - handle.get_stream()); - - // Select neighbor clusters for each query. 
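Spelling out what the two GEMMs above compute (a reading of the surrounding code, not an extra step): for the L2-style metrics each query row is padded to dimDatasetExt with fillValue = -1/2 in column dimDataset, and each stored cluster-center row carries its squared norm in that same column, so the first GEMM with alpha = -2 over gemmK = dimDataset + 1 columns produces

  QCDistances[q][c] = -2 * <x_q, c> + |c|^2
                    = |x_q - c|^2 - |x_q|^2

which ranks clusters exactly like the true squared distance because |x_q|^2 is constant per query; for the inner-product metric the extra column is skipped and alpha = -1 yields a plain negated dot product (8-bit queries are also divided by 256 or 128 during the copy-fill so they match the scaling used at build time). The second GEMM rotates the queries so the probed clusters' PQ codes can be scored in the rotated space, and the top-k selection just below keeps the numProbes smallest QCDistances per query.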
- _cuann_find_topk(handle, - desc->numProbes, - nQueries, - desc->numClusters, - NULL, - QCDistances, - clusterLabelsToProbe, - topkWorkspace, - false); - - for (uint32_t j = 0; j < nQueries; j += desc->maxBatchSize) { - uint32_t batchSize = min(desc->maxBatchSize, nQueries - j); - _ivfpq_search(handle, - desc, - batchSize, - clusterRotCenters, - pqCenters, - pqDataset, - originalNumbers, - cluster_offsets, - clusterLabelsToProbe + ((uint64_t)(desc->numProbes) * j), - rotQueries + ((uint64_t)(desc->dimRotDataset) * j), - neighbors + ((uint64_t)(desc->topK) * (i + j)), - distances + ((uint64_t)(desc->topK) * (i + j)), - searchWorkspace); - } - } - - _cuann_set_device(orgDevId); -} - -// -template -__device__ inline float ivfpq_compute_score( - uint32_t dimPq, - uint32_t iDataset, - const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] - const smemLutDtype* preCompScores, // [dimPq, 1 << bitPq] - bool earlyStop, - float kth_score = FLT_MAX) -{ - float score = 0.0; - constexpr uint32_t bitT = sizeof(T) * 8; - const T* headPqDataset = (T*)(pqDataset + (uint64_t)iDataset * (dimPq * bitPq / 8)); - for (int j = 0; j < dimPq / vecLen; j += 1) { - T pqCode = headPqDataset[0]; - headPqDataset += 1; - uint32_t bitLeft = bitT; -#pragma unroll vecLen - for (int k = 0; k < vecLen; k += 1) { - uint8_t code = pqCode; - if (bitLeft > bitPq) { - // This condition is always true here (to make the compiler happy) - if constexpr (bitT > bitPq) { pqCode >>= bitPq; } - bitLeft -= bitPq; - } else { - if (k < vecLen - 1) { - pqCode = headPqDataset[0]; - headPqDataset += 1; - } - code |= (pqCode << bitLeft); - pqCode >>= (bitPq - bitLeft); - bitLeft += (bitT - bitPq); - } - code &= (1 << bitPq) - 1; - score += (float)preCompScores[code]; - preCompScores += (1 << bitPq); - - if (earlyStop && (vecLen > 8) && ((k % 8) == 0)) { - if (score > kth_score) { return FLT_MAX; } - } - } - if (earlyStop && (vecLen <= 8)) { - if (score > kth_score) { return FLT_MAX; } - } - } - return score; -} - -// -template -__device__ inline void warp_merge(K& key, bool acending = true, int group_size = 32) -{ - int lane_id = threadIdx.x % 32; - for (int mask = (group_size >> 1); mask > 0; mask >>= 1) { - bool direction = ((lane_id & mask) == 0); - K opp_key = __shfl_xor_sync(0xffffffff, key, mask); - if ((acending == direction) == (key > opp_key)) { key = opp_key; } - } -} - -// -template -__device__ inline void warp_merge(K& key, V& val, bool acending = true, int group_size = 32) -{ - int lane_id = threadIdx.x % 32; - for (int mask = (group_size >> 1); mask > 0; mask >>= 1) { - bool direction = ((lane_id & mask) == 0); - K opp_key = __shfl_xor_sync(0xffffffff, key, mask); - V opp_val = __shfl_xor_sync(0xffffffff, val, mask); - if ((acending == direction) == ((key > opp_key) || ((key == opp_key) && (val > opp_val)))) { - key = opp_key; - val = opp_val; - } - } -} - -// -template -__device__ inline void warp_sort(K& key, bool acending = true) -{ - int lane_id = threadIdx.x % 32; - for (int group_size = 2; group_size <= 32; group_size <<= 1) { - bool direction = ((lane_id & group_size) == 0); - if ((group_size == 32) && (!acending)) { direction = !direction; } - warp_merge(key, direction, group_size); - } -} - -// -template -__device__ inline void warp_sort(K& key, V& val, bool acending = true) -{ - int lane_id = threadIdx.x % 32; - for (int group_size = 2; group_size <= 32; group_size <<= 1) { - bool direction = ((lane_id & group_size) == 0); - if ((group_size == 32) && (!acending)) { direction = !direction; } - warp_merge(key, val, 
direction, group_size); - } -} - -// -template -__device__ inline void swap(T& val1, T& val2) -{ - T val0 = val1; - val1 = val2; - val2 = val0; -} - -// -template -__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) -{ - if ((key1 > key2) || ((key1 == key2) && (val1 > val2))) { - swap(key1, key2); - swap(val1, val2); - return true; - } - return false; -} - -// -template -__device__ inline bool swap_if_needed(K& key1, K& key2) -{ - if (key1 > key2) { - swap(key1, key2); - return true; - } - return false; -} - -// -template -__device__ inline T max_value_of(); -template <> -__device__ inline float max_value_of() -{ - return FLT_MAX; -} -template <> -__device__ inline uint32_t max_value_of() -{ - return ~0u; -} - -// -template -class BlockTopk { - public: - __device__ BlockTopk(uint32_t topk, K* ptr_kth_key) : _topk(topk), _lane_id(threadIdx.x % 32) - { -#pragma unroll - for (int i = 0; i < depth; i++) { - _key[i] = max_value_of(); - _val[i] = max_value_of(); - } - _nfill = 0; - _init_buf(); - _ptr_kth_key = ptr_kth_key; - if (_ptr_kth_key) { - _kth_key = _ptr_kth_key[0]; - } else { - _kth_key = max_value_of(); - } - // __syncthreads(); - } - - __device__ inline K key(int i) { return _key[i]; } - - __device__ inline V val(int i) { return _val[i]; } - - __device__ inline K kth_key() { return _kth_key; } - - __device__ void add(K key, V val) - { - uint32_t mask = __ballot_sync(0xffffffff, (key < _kth_key)); - if (mask == 0) { return; } - uint32_t nvalid = __popc(mask); - if (_buf_nvalid + nvalid > 32) { - _add(_buf_key, _buf_val); - _init_buf(); - if (_ptr_kth_key) { _kth_key = min(_kth_key, _ptr_kth_key[0]); } - } - _push_buf(key, val, mask, nvalid); - } - - __device__ void finalize() - { - if (_buf_nvalid > 0) { _add(_buf_key, _buf_val); } - _merge(); - } - - protected: - K _key[depth]; - V _val[depth]; - K* _ptr_kth_key; - K _kth_key; - uint32_t _nfill; // 0 <= _nfill <= depth - K _buf_key; - V _buf_val; - uint32_t _buf_nvalid; // 0 <= _buf_nvalid <= 32 - - const uint32_t _topk; - const uint32_t _lane_id; - - __device__ inline void _init_buf() - { - _buf_nvalid = 0; - _buf_key = max_value_of(); - _buf_val = max_value_of(); - } - - __device__ inline void _adjust_nfill() - { -#pragma unroll - for (int j = 1; j < depth; j++) { - if (_nfill == depth - j + 1) { - if (__shfl_sync(0xffffffff, _key[depth - j], 0) <= _kth_key) { return; } - _nfill = depth - j; - } - } - } - - __device__ inline void _push_buf(K key, V val, uint32_t mask, uint32_t nvalid) - { - int i = 0; - if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { - int j = _lane_id - _buf_nvalid; - while (j > 0) { - i = __ffs(mask) - 1; - mask ^= (0x1u << i); - j -= 1; - } - i = __ffs(mask) - 1; - } - K temp_key = __shfl_sync(0xffffffff, key, i); - K temp_val = __shfl_sync(0xffffffff, val, i); - if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { - _buf_key = temp_key; - _buf_val = temp_val; - } - _buf_nvalid += nvalid; - } - - __device__ inline void _add(K key, V val) - { - if (_nfill == 0) { - warp_sort(key, val); - _key[0] = key; - _val[0] = val; - } else if (_nfill == 1) { - warp_sort(key, val, false); - swap_if_needed(_key[0], key, _val[0], val); - if (depth > 1) { - _key[1] = key; - _val[1] = val; - warp_merge(_key[1], _val[1]); - } - warp_merge(_key[0], _val[0]); - } else if ((depth >= 2) && (_nfill == 2)) { - warp_sort(key, val, false); - swap_if_needed(_key[1], key, _val[1], val); - if (depth > 2) { - _key[2] = key; - _val[2] = val; - warp_merge(_key[2], _val[2]); - } - 
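// warp_sort and warp_merge above implement a bitonic sorting network across one warp, using
// __shfl_xor_sync exchanges instead of shared memory; BlockTopk builds on them to keep its
// per-warp candidate lists sorted. For reference, a self-contained ascending variant for a
// single float key per lane (the function name is illustrative, not part of this header):
__device__ inline float warp_sort_ascending(float key)
{
  const int lane_id = threadIdx.x % 32;
  for (int group_size = 2; group_size <= 32; group_size <<= 1) {
    // Alternating the direction per group turns already-sorted runs into bitonic sequences;
    // the final pass (group_size == 32) merges the whole warp in ascending order.
    bool group_dir = ((lane_id & group_size) == 0);
    for (int mask = group_size >> 1; mask > 0; mask >>= 1) {
      bool pair_dir = ((lane_id & mask) == 0);
      float opp_key = __shfl_xor_sync(0xffffffff, key, mask);
      if ((group_dir == pair_dir) == (key > opp_key)) { key = opp_key; }
    }
  }
  return key;  // lane 0 ends up with the smallest key, lane 31 with the largest
}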
warp_merge(_key[1], _val[1], false); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if ((depth >= 3) && (_nfill == 3)) { - warp_sort(key, val, false); - swap_if_needed(_key[2], key, _val[2], val); - if (depth > 3) { - _key[3] = key; - _val[3] = val; - warp_merge(_key[3], _val[3]); - } - warp_merge(_key[2], _val[2], false); - swap_if_needed(_key[1], _key[2], _val[1], _val[2]); - warp_merge(_key[2], _val[2]); - warp_merge(_key[1], _val[1], false); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if ((depth >= 4) && (_nfill == 4)) { - warp_sort(key, val, false); - swap_if_needed(_key[3], key, _val[3], val); - warp_merge(_key[3], _val[3], false); - swap_if_needed(_key[2], _key[3], _val[2], _val[3]); - warp_merge(_key[3], _val[3]); - warp_merge(_key[2], _val[2], false); - swap_if_needed(_key[1], _key[2], _val[1], _val[2]); - warp_merge(_key[2], _val[2]); - warp_merge(_key[1], _val[1], false); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } - _nfill = min(_nfill + 1, depth); - if (_nfill == depth) { - _kth_key = - min(_kth_key, __shfl_sync(0xffffffff, _key[depth - 1], _topk - 1 - (depth - 1) * 32)); - } - } - - __device__ inline void _merge() - { - uint32_t warp_id = threadIdx.x / 32; - uint32_t num_warps = blockDim.x / 32; - K* smem_key = smemArray; - V* smem_val = (V*)(smem_key + (blockDim.x / 2) * depth); - for (int j = num_warps / 2; j > 0; j /= 2) { - __syncthreads(); - if ((j <= warp_id) && (warp_id < (j * 2))) { - uint32_t opp_tid = threadIdx.x - (j * 32); - smem_key[opp_tid] = _key[0]; - smem_val[opp_tid] = _val[0]; - if (depth >= 2) { - smem_key[opp_tid + (j * 32)] = _key[1]; - smem_val[opp_tid + (j * 32)] = _val[1]; - } - if (depth >= 3) { - smem_key[opp_tid + (j * 32) * 2] = _key[2]; - smem_val[opp_tid + (j * 32) * 2] = _val[2]; - } - if (depth >= 4) { - smem_key[opp_tid + (j * 32) * 3] = _key[3]; - smem_val[opp_tid + (j * 32) * 3] = _val[3]; - } - } - __syncthreads(); - if (warp_id < j) { - K key; - V val; - if (depth == 1) { - key = smem_key[threadIdx.x ^ 31]; - val = smem_val[threadIdx.x ^ 31]; - swap_if_needed(_key[0], key, _val[0], val); - - warp_merge(_key[0], _val[0]); - } else if (depth == 2) { - key = smem_key[threadIdx.x ^ 31 + (j * 32)]; - val = smem_val[threadIdx.x ^ 31 + (j * 32)]; - swap_if_needed(_key[0], key, _val[0], val); - key = smem_key[threadIdx.x ^ 31]; - val = smem_val[threadIdx.x ^ 31]; - swap_if_needed(_key[1], key, _val[1], val); - - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if (depth == 3) { - key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; - val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; - swap_if_needed(_key[1], key, _val[1], val); - key = smem_key[threadIdx.x ^ 31 + (j * 32)]; - val = smem_val[threadIdx.x ^ 31 + (j * 32)]; - swap_if_needed(_key[2], key, _val[2], val); - K _key_3_ = smem_key[threadIdx.x ^ 31]; - V _val_3_ = smem_val[threadIdx.x ^ 31]; - - swap_if_needed(_key[0], _key[2], _val[0], _val[2]); - swap_if_needed(_key[1], _key_3_, _val[1], _val_3_); - swap_if_needed(_key[2], _key_3_, _val[2], _val_3_); - warp_merge(_key[2], _val[2]); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if (depth == 4) { - key = smem_key[threadIdx.x ^ 31 + (j * 32) * 3]; - val = 
smem_val[threadIdx.x ^ 31 + (j * 32) * 3]; - swap_if_needed(_key[0], key, _val[0], val); - key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; - val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; - swap_if_needed(_key[1], key, _val[1], val); - key = smem_key[threadIdx.x ^ 31 + (j * 32)]; - val = smem_val[threadIdx.x ^ 31 + (j * 32)]; - swap_if_needed(_key[2], key, _val[2], val); - key = smem_key[threadIdx.x ^ 31]; - val = smem_val[threadIdx.x ^ 31]; - swap_if_needed(_key[3], key, _val[3], val); - - swap_if_needed(_key[0], _key[2], _val[0], _val[2]); - swap_if_needed(_key[1], _key[3], _val[1], _val[3]); - swap_if_needed(_key[2], _key[3], _val[2], _val[3]); - warp_merge(_key[3], _val[3]); - warp_merge(_key[2], _val[2]); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } - } - } - } -}; - -// -template -__device__ inline void update_approx_global_score(uint32_t topk, - K* my_score, - K* approx_global_score) -{ - if (!__any_sync(0xffffffff, (my_score[0] < approx_global_score[topk - 1]))) { return; } - if (topk <= 32) { - K score = max_value_of(); - if (threadIdx.x < topk) { score = approx_global_score[threadIdx.x]; } - warp_sort(score, false); - swap_if_needed(my_score[0], score); - - warp_merge(my_score[0]); - if (threadIdx.x < topk) { atomicMin(approx_global_score + threadIdx.x, my_score[0]); } - } else if (topk <= 64) { - K score = max_value_of(); - if (threadIdx.x + 32 < topk) { score = approx_global_score[threadIdx.x + 32]; } - warp_sort(score, false); - swap_if_needed(my_score[0], score); - score = approx_global_score[threadIdx.x]; - warp_sort(score, false); - swap_if_needed(my_score[1], score); - - swap_if_needed(my_score[0], my_score[1]); - warp_merge(my_score[1]); - warp_merge(my_score[0]); - - atomicMin(approx_global_score + threadIdx.x, my_score[0]); - if (threadIdx.x + 32 < topk) { atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); } - } else if (topk <= 96) { - K score = max_value_of(); - if (threadIdx.x + 64 < topk) { score = approx_global_score[threadIdx.x + 64]; } - warp_sort(score, false); - swap_if_needed(my_score[1], score); - score = approx_global_score[threadIdx.x + 32]; - warp_sort(score, false); - swap_if_needed(my_score[2], score); - score = approx_global_score[threadIdx.x]; - warp_sort(score, false); - K my_score_3_ = score; - - swap_if_needed(my_score[0], my_score[2]); - swap_if_needed(my_score[1], my_score_3_); - swap_if_needed(my_score[2], my_score_3_); - warp_merge(my_score[2]); - swap_if_needed(my_score[0], my_score[1]); - warp_merge(my_score[1]); - warp_merge(my_score[0]); - - atomicMin(approx_global_score + threadIdx.x, my_score[0]); - atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); - if (threadIdx.x + 64 < topk) { atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); } - } else if (topk <= 128) { - K score = max_value_of(); - if (threadIdx.x + 96 < topk) { score = approx_global_score[threadIdx.x + 96]; } - warp_sort(score, false); - swap_if_needed(my_score[0], score); - score = approx_global_score[threadIdx.x + 64]; - warp_sort(score, false); - swap_if_needed(my_score[1], score); - score = approx_global_score[threadIdx.x + 32]; - warp_sort(score, false); - swap_if_needed(my_score[2], score); - score = approx_global_score[threadIdx.x]; - warp_sort(score, false); - swap_if_needed(my_score[3], score); - - swap_if_needed(my_score[0], my_score[2]); - swap_if_needed(my_score[1], my_score[3]); - swap_if_needed(my_score[2], my_score[3]); - warp_merge(my_score[3]); - 
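// update_approx_global_score relies on atomicMin taking a float pointer. CUDA itself provides
// atomicMin only for integer types, so these calls presumably resolve to a float overload
// defined elsewhere in this header; that definition is not shown here. One common way to
// provide such an overload, sketched under an illustrative name, is a compare-and-swap loop:
__device__ inline float atomic_min_float(float* addr, float value)
{
  float observed = *addr;
  while (observed > value) {
    int assumed  = __float_as_int(observed);
    int previous = atomicCAS(reinterpret_cast<int*>(addr), assumed, __float_as_int(value));
    if (previous == assumed) { break; }   // we stored 'value'; the minimum is updated
    observed = __int_as_float(previous);  // lost the race; re-check against the new value
  }
  return observed;
}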
warp_merge(my_score[2]); - swap_if_needed(my_score[0], my_score[1]); - warp_merge(my_score[1]); - warp_merge(my_score[0]); - - atomicMin(approx_global_score + threadIdx.x, my_score[0]); - atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); - atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); - if (threadIdx.x + 96 < topk) { atomicMin(approx_global_score + threadIdx.x + 96, my_score[3]); } - } -} - -// -template -__device__ inline outDtype get_out_score(float score, distance::DistanceType metric) -{ - if (metric == distance::DistanceType::InnerProduct) { score = score / 2.0 - 1.0; } - if (sizeof(outDtype) == 2) { score = min(score, FP16_MAX); } - return (outDtype)score; -} - -// -// (*) Restrict the peak GPU occupancy up-to 50% by "__launch_bounds__(1024, 1)", -// as there were cases where performance dropped by a factor of two or more on V100 -// when the peak GPU occupancy was set to more than 50%. -// -template -__launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( - uint32_t numDataset, - uint32_t dimDataset, - uint32_t numProbes, - uint32_t dimPq, - uint32_t sizeBatch, - uint32_t maxSamples, - distance::DistanceType metric, - cuannPqCenter_t typePqCenter, - uint32_t topk, - const float* clusterCenters, // [numClusters, dimDataset,] - const float* pqCenters, // [dimPq, 1 << bitPq, lenPq,], or - // [numClusetrs, 1 << bitPq, lenPq,] - const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] - const uint32_t* clusterIndexPtr, // [numClusters + 1,] - const uint32_t* _clusterLabels, // [sizeBatch, numProbes,] - const uint32_t* _chunkIndexPtr, // [sizeBatch, numProbes,] - const float* _query, // [sizeBatch, dimDataset,] - const uint32_t* indexList, // [sizeBatch * numProbes] - float* _preCompScores, // [...] - float* _topkScores, // [sizeBatch, topk] - outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] - uint32_t* _topkIndex // [sizeBatch, numProbes, topk] -) -{ - const uint32_t lenPq = dimDataset / dimPq; - float* smem = smemArray; - - smemLutDtype* preCompScores = (smemLutDtype*)smem; - float* baseDiff = NULL; - if (preCompBaseDiff) { baseDiff = (float*)(preCompScores + (dimPq << bitPq)); } - bool manageLocalTopk = false; - if (_topkIndex != NULL) { manageLocalTopk = true; } - - uint32_t iBatch; - uint32_t iProbe; - if (indexList == NULL) { - // iBatch = blockIdx.x / numProbes; - // iProbe = blockIdx.x % numProbes; - iBatch = blockIdx.x % sizeBatch; - iProbe = blockIdx.x / sizeBatch; - } else { - iBatch = indexList[blockIdx.x] / numProbes; - iProbe = indexList[blockIdx.x] % numProbes; - } - if (iBatch >= sizeBatch || iProbe >= numProbes) return; - - const uint32_t* clusterLabels = _clusterLabels + (numProbes * iBatch); - const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); - const float* query = _query + (dimDataset * iBatch); - outDtype* output; - uint32_t* topkIndex = NULL; - float* approx_global_score = NULL; - if (manageLocalTopk) { - // Store topk calculated distances to output (and its indices to topkIndex) - output = _output + (topk * (iProbe + (numProbes * iBatch))); - topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); - approx_global_score = _topkScores + (topk * iBatch); - } else { - // Store all calculated distances to output - output = _output + (maxSamples * iBatch); - } - uint32_t label = clusterLabels[iProbe]; - const float* myClusterCenter = clusterCenters + (dimDataset * label); - const float* myPqCenters; - if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - 
myPqCenters = pqCenters; - } else { - myPqCenters = pqCenters + (lenPq << bitPq) * label; - } - - if (preCompBaseDiff) { - // Reduce computational complexity by pre-computing the difference - // between the cluster centroid and the query. - for (uint32_t i = threadIdx.x; i < dimDataset; i += blockDim.x) { - baseDiff[i] = query[i] - myClusterCenter[i]; - } - __syncthreads(); - } - - // Create a lookup table - for (uint32_t i = threadIdx.x; i < (dimPq << bitPq); i += blockDim.x) { - uint32_t iPq = i >> bitPq; - uint32_t iCode = i & ((1 << bitPq) - 1); - float score = 0.0; - for (uint32_t j = 0; j < lenPq; j++) { - uint32_t k = j + (lenPq * iPq); - float diff; - if (preCompBaseDiff) { - diff = baseDiff[k]; - } else { - diff = query[k] - myClusterCenter[k]; - } - if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - diff -= myPqCenters[j + (lenPq * i)]; - } else { - diff -= myPqCenters[j + (lenPq * iCode)]; - } - score += diff * diff; - } - preCompScores[i] = (smemLutDtype)score; - } - - uint32_t iSampleBase = 0; - if (iProbe > 0) { iSampleBase = chunkIndexPtr[iProbe - 1]; } - uint32_t nSamples = chunkIndexPtr[iProbe] - iSampleBase; - uint32_t nSamples32 = nSamples; - if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } - uint32_t iDatasetBase = clusterIndexPtr[label]; - - BlockTopk block_topk( - topk, manageLocalTopk ? approx_global_score + topk - 1 : NULL); - __syncthreads(); - - // Compute a distance for each sample - for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { - float score = FLT_MAX; - if (i < nSamples) { - score = ivfpq_compute_score( - dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); - } - if (!manageLocalTopk) { - if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } - } else { - uint32_t val = i; - block_topk.add(score, val); - } - } - if (!manageLocalTopk) { return; } - block_topk.finalize(); - - // Output topk score and index - uint32_t warp_id = threadIdx.x / 32; - if (warp_id == 0) { - for (int j = 0; j < depth; j++) { - if (threadIdx.x + (32 * j) < topk) { - output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); - topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; - } - } - } - - // Approximate update of global topk entries - if (warp_id == 0) { - float my_score[depth]; - for (int j = 0; j < depth; j++) { - my_score[j] = block_topk.key(j); - } - update_approx_global_score(topk, my_score, approx_global_score); - } -} - -// -template -__launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( - uint32_t numDataset, - uint32_t dimDataset, - uint32_t numProbes, - uint32_t dimPq, - uint32_t sizeBatch, - uint32_t maxSamples, - distance::DistanceType metric, - cuannPqCenter_t typePqCenter, - uint32_t topk, - const float* clusterCenters, // [numClusters, dimDataset,] - const float* pqCenters, // [dimPq, 1 << bitPq, lenPq,], or - // [numClusetrs, 1 << bitPq, lenPq,] - const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] - const uint32_t* clusterIndexPtr, // [numClusters + 1,] - const uint32_t* _clusterLabels, // [sizeBatch, numProbes,] - const uint32_t* _chunkIndexPtr, // [sizeBatch, numProbes,] - const float* _query, // [sizeBatch, dimDataset,] - const uint32_t* indexList, // [sizeBatch * numProbes] - float* _preCompScores, // [..., dimPq << bitPq,] - float* _topkScores, // [sizeBatch, topk] - outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] - uint32_t* _topkIndex // [sizeBatch, 
numProbes, topk] -) -{ - const uint32_t lenPq = dimDataset / dimPq; - - float* preCompScores = _preCompScores + ((dimPq << bitPq) * blockIdx.x); - float* baseDiff = NULL; - if (preCompBaseDiff) { baseDiff = (float*)smemArray; } - bool manageLocalTopk = false; - if (_topkIndex != NULL) { manageLocalTopk = true; } - - for (int ib = blockIdx.x; ib < sizeBatch * numProbes; ib += gridDim.x) { - uint32_t iBatch; - uint32_t iProbe; - if (indexList == NULL) { - // iBatch = ib / numProbes; - // iProbe = ib % numProbes; - iBatch = ib % sizeBatch; - iProbe = ib / sizeBatch; - } else { - iBatch = indexList[ib] / numProbes; - iProbe = indexList[ib] % numProbes; - } - - const uint32_t* clusterLabels = _clusterLabels + (numProbes * iBatch); - const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); - const float* query = _query + (dimDataset * iBatch); - outDtype* output; - uint32_t* topkIndex = NULL; - float* approx_global_score = NULL; - if (manageLocalTopk) { - // Store topk calculated distances to output (and its indices to topkIndex) - output = _output + (topk * (iProbe + (numProbes * iBatch))); - topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); - approx_global_score = _topkScores + (topk * iBatch); - } else { - // Store all calculated distances to output - output = _output + (maxSamples * iBatch); - } - uint32_t label = clusterLabels[iProbe]; - const float* myClusterCenter = clusterCenters + (dimDataset * label); - const float* myPqCenters; - if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - myPqCenters = pqCenters; - } else { - myPqCenters = pqCenters + (lenPq << bitPq) * label; - } - - if (preCompBaseDiff) { - // Reduce computational complexity by pre-computing the difference - // between the cluster centroid and the query. - for (uint32_t i = threadIdx.x; i < dimDataset; i += blockDim.x) { - baseDiff[i] = query[i] - myClusterCenter[i]; - } - __syncthreads(); - } - - // Create a lookup table - for (uint32_t i = threadIdx.x; i < (dimPq << bitPq); i += blockDim.x) { - uint32_t iPq = i >> bitPq; - uint32_t iCode = i & ((1 << bitPq) - 1); - float score = 0.0; - for (uint32_t j = 0; j < lenPq; j++) { - uint32_t k = j + (lenPq * iPq); - float diff; - if (preCompBaseDiff) { - diff = baseDiff[k]; - } else { - diff = query[k] - myClusterCenter[k]; - } - if (typePqCenter == CUANN_PQ_CENTER_PER_SUBSPACE) { - diff -= myPqCenters[j + (lenPq * i)]; - } else { - diff -= myPqCenters[j + (lenPq * iCode)]; - } - score += diff * diff; - } - preCompScores[i] = score; - } - - uint32_t iSampleBase = 0; - if (iProbe > 0) { iSampleBase = chunkIndexPtr[iProbe - 1]; } - uint32_t nSamples = chunkIndexPtr[iProbe] - iSampleBase; - uint32_t nSamples32 = nSamples; - if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } - uint32_t iDatasetBase = clusterIndexPtr[label]; - - BlockTopk block_topk( - topk, manageLocalTopk ? approx_global_score + topk - 1 : NULL); - __syncthreads(); - - // Compute a distance for each sample - for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { - float score = FLT_MAX; - if (i < nSamples) { - score = ivfpq_compute_score( - dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); - } - if (!manageLocalTopk) { - if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } - } else { - uint32_t val = i; - block_topk.add(score, val); - } - } - __syncthreads(); - if (!manageLocalTopk) { - continue; // for (int ib ...) 
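// The loop above fills, per (query, probe) pair, a lookup table of squared distances between
// each sub-vector of the centered query and every codebook entry, i.e. roughly
// preCompScores[iPq * (1 << bitPq) + code] = || query_sub(iPq) - codebook_entry(code) ||^2.
// Scoring a database vector then reduces to one table lookup per PQ sub-space, which is what
// ivfpq_compute_score does with bit-packed codes. A scalar sketch of that reduction for the
// simple bitPq == 8 case, where each sub-space occupies exactly one byte (names illustrative):
inline float score_from_lut(const uint8_t* codes,  // [dimPq] codes of one database vector
                            const float* lut,      // [dimPq, 256] table for this query/probe
                            uint32_t dimPq)
{
  float score = 0.0f;
  for (uint32_t iPq = 0; iPq < dimPq; iPq++) {
    score += lut[(iPq << 8) + codes[iPq]];  // one lookup per sub-space, summed into the distance
  }
  return score;
}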
- } - block_topk.finalize(); - - // Output topk score and index - uint32_t warp_id = threadIdx.x / 32; - if (warp_id == 0) { - for (int j = 0; j < depth; j++) { - if (threadIdx.x + (32 * j) < topk) { - output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); - topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; - } - } - } - - // Approximate update of global topk entries - if (warp_id == 0) { - float my_score[depth]; - for (int j = 0; j < depth; j++) { - my_score[j] = block_topk.key(j); - } - update_approx_global_score(topk, my_score, approx_global_score); - } - __syncthreads(); - } -} - -// search -template -inline void ivfpq_search(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - uint32_t numQueries, - const float* clusterCenters, // [numDataset, dimRotDataset] - const float* pqCenters, // [dimPq, 1 << desc->bitPq, lenPq] - const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* cluster_offsets, // [numClusters + 1] - const uint32_t* clusterLabelsToProbe, // [numQueries, numProbes] - const float* query, // [numQueries, dimRotDataset] - uint64_t* topkNeighbors, // [numQueries, topK] - float* topkDistances, // [numQueries, topK] - void* workspace) -{ - RAFT_EXPECTS(numQueries <= desc->maxBatchSize, - "number of queries (%u) must be smaller the max batch size (%u)", - numQueries, - desc->maxBatchSize); - - uint32_t* clusterLabelsOut; // [maxBatchSize, numProbes] - uint32_t* indexList; // [maxBatchSize * numProbes] - uint32_t* indexListSorted; // [maxBatchSize * numProbes] - uint32_t* numSamples; // [maxBatchSize,] - void* cubWorkspace; // ... - uint32_t* chunkIndexPtr; // [maxBatchSize, numProbes] - uint32_t* topkSids; // [maxBatchsize, topk] - scoreDtype* similarity; // [maxBatchSize, maxSamples] or - // [maxBatchSize, numProbes, topk] - uint32_t* simTopkIndex; // [maxBatchSize, numProbes, topk] - float* topkScores; // [maxBatchSize, topk] - float* preCompScores = NULL; - void* topkWorkspace; - - clusterLabelsOut = (uint32_t*)workspace; - indexList = - (uint32_t*)((uint8_t*)clusterLabelsOut + - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); - indexListSorted = - (uint32_t*)((uint8_t*)indexList + - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); - numSamples = - (uint32_t*)((uint8_t*)indexListSorted + - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); - cubWorkspace = - (void*)((uint8_t*)numSamples + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize)); - chunkIndexPtr = (uint32_t*)((uint8_t*)cubWorkspace + desc->sizeCubWorkspace); - topkSids = - (uint32_t*)((uint8_t*)chunkIndexPtr + - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes)); - similarity = - (scoreDtype*)((uint8_t*)topkSids + - Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->topK)); - if (manage_local_topk(desc)) { - topkScores = - (float*)((uint8_t*)similarity + Pow2<128>::roundUp(sizeof(scoreDtype) * desc->maxBatchSize * - desc->numProbes * desc->topK)); - simTopkIndex = (uint32_t*)((uint8_t*)topkScores + - Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK)); - preCompScores = - (float*)((uint8_t*)simTopkIndex + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * - desc->numProbes * desc->topK)); - } else { - topkScores = NULL; - simTopkIndex = NULL; - preCompScores = - (float*)((uint8_t*)similarity + - Pow2<128>::roundUp(sizeof(scoreDtype) * 
desc->maxBatchSize * desc->maxSamples)); - } - topkWorkspace = - (void*)((uint8_t*)preCompScores + Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * - desc->dimPq * (1 << desc->bitPq))); - - // - if (manage_local_topk(desc)) { - dim3 iksThreads(128, 1, 1); - dim3 iksBlocks(((numQueries * desc->topK) + iksThreads.x - 1) / iksThreads.x, 1, 1); - ivfpq_init_topkScores<<>>( - topkScores, FLT_MAX, numQueries * desc->topK); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif - } - - // - dim3 mcThreads(1024, 1, 1); // DO NOT CHANGE - dim3 mcBlocks(numQueries, 1, 1); - ivfpq_make_chunk_index_ptr<<>>( - desc->numProbes, numQueries, cluster_offsets, clusterLabelsToProbe, chunkIndexPtr, numSamples); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif - - if (numQueries * desc->numProbes > 256) { - // Sorting index by cluster number (label). - // The goal is to incrase the L2 cache hit rate to read the vectors - // of a cluster by processing the cluster at the same time as much as - // possible. - dim3 psThreads(128, 1, 1); - dim3 psBlocks((numQueries * desc->numProbes + psThreads.x - 1) / psThreads.x, 1, 1); - ivfpq_prep_sort<<>>(numQueries * desc->numProbes, - indexList); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif - - int begin_bit = 0; - int end_bit = sizeof(uint32_t) * 8; - cub::DeviceRadixSort::SortPairs(cubWorkspace, - desc->sizeCubWorkspace, - clusterLabelsToProbe, - clusterLabelsOut, - indexList, - indexListSorted, - numQueries * desc->numProbes, - begin_bit, - end_bit, - handle.get_stream()); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif - } else { - indexListSorted = NULL; - } - - // Select a GPU kernel for distance calculation -#define SET_KERNEL1(B, V, T, D) \ - do { \ - static_assert((B * V) % (sizeof(T) * 8) == 0); \ - kernel_no_basediff = ivfpq_compute_similarity; \ - kernel_fast = ivfpq_compute_similarity; \ - kernel_no_smem_lut = ivfpq_compute_similarity_no_smem_lut; \ - } while (0) - -#define SET_KERNEL2(B, M, D) \ - do { \ - RAFT_EXPECTS(desc->dimPq % M == 0, "dimPq must be a multiple of %u", M); \ - if (desc->dimPq % (M * 8) == 0) { \ - SET_KERNEL1(B, (M * 8), uint64_t, D); \ - } else if (desc->dimPq % (M * 4) == 0) { \ - SET_KERNEL1(B, (M * 4), uint32_t, D); \ - } else if (desc->dimPq % (M * 2) == 0) { \ - SET_KERNEL1(B, (M * 2), uint16_t, D); \ - } else if (desc->dimPq % (M * 1) == 0) { \ - SET_KERNEL1(B, (M * 1), uint8_t, D); \ - } \ - } while (0) - -#define SET_KERNEL3(D) \ - do { \ - switch (desc->bitPq) { \ - case 4: SET_KERNEL2(4, 2, D); break; \ - case 5: SET_KERNEL2(5, 8, D); break; \ - case 6: SET_KERNEL2(6, 4, D); break; \ - case 7: SET_KERNEL2(7, 8, D); break; \ - case 8: SET_KERNEL2(8, 1, D); break; \ - } \ - } while (0) - - typedef void (*kernel_t)(uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - distance::DistanceType, - cuannPqCenter_t, - uint32_t, - const float*, - const float*, - const uint8_t*, - const uint32_t*, - const uint32_t*, - const uint32_t*, - const float*, - const uint32_t*, - float*, - float*, - scoreDtype*, - uint32_t*); - kernel_t kernel_no_basediff; - kernel_t kernel_fast; - kernel_t kernel_no_smem_lut; - int depth = 1; - if (manage_local_topk(desc)) { depth = (desc->topK + 31) / 32; } - switch (depth) { - case 1: SET_KERNEL3(1); break; - case 2: SET_KERNEL3(2); break; - case 3: SET_KERNEL3(3); break; - case 4: SET_KERNEL3(4); break; - default: RAFT_FAIL("ivf_pq::search(k = %u): depth value is too big 
(%d)", desc->topK, depth); - } - RAFT_LOG_DEBUG("ivf_pq::search(k = %u, depth = %d, dim = %u/%u/%u)", - desc->topK, - depth, - desc->dimDataset, - desc->dimRotDataset, - desc->dimPq); - constexpr size_t thresholdSmem = 48 * 1024; - size_t sizeSmem = sizeof(smemLutDtype) * desc->dimPq * (1 << desc->bitPq); - size_t sizeSmemBaseDiff = sizeof(float) * desc->dimRotDataset; - - uint32_t numCTAs = numQueries * desc->numProbes; - int numThreads = 1024; - // desc->preferredThreadBlockSize == 0 means using auto thread block size calculation mode - if (desc->preferredThreadBlockSize == 0) { - constexpr int minThreads = 256; - while (numThreads > minThreads) { - if (numCTAs < uint32_t(getMultiProcessorCount() * (1024 / (numThreads / 2)))) { break; } - if (handle.get_device_properties().sharedMemPerMultiprocessor * 2 / 3 < - sizeSmem * (1024 / (numThreads / 2))) { - break; - } - numThreads /= 2; - } - } else { - numThreads = desc->preferredThreadBlockSize; - } - size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); - sizeSmem = max(sizeSmem, sizeSmemForLocalTopk); - - kernel_t kernel = kernel_no_basediff; - - bool kernel_no_basediff_available = true; - if (sizeSmem > thresholdSmem) { - cudaError_t cudaError = cudaFuncSetAttribute( - kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem); - if (cudaError != cudaSuccess) { - RAFT_EXPECTS( - cudaError == cudaGetLastError(), - "Tried to reset the expected cuda error code, but it didn't match the expectation"); - kernel_no_basediff_available = false; - - // Use "kernel_no_smem_lut" which just uses small amount of shared memory. - kernel = kernel_no_smem_lut; - numThreads = 1024; - size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); - sizeSmem = max(sizeSmemBaseDiff, sizeSmemForLocalTopk); - numCTAs = getMultiProcessorCount(); - } - } - if (kernel_no_basediff_available) { - bool kernel_fast_available = true; - if (sizeSmem + sizeSmemBaseDiff > thresholdSmem) { - cudaError_t cudaError = cudaFuncSetAttribute( - kernel_fast, cudaFuncAttributeMaxDynamicSharedMemorySize, sizeSmem + sizeSmemBaseDiff); - if (cudaError != cudaSuccess) { - RAFT_EXPECTS( - cudaError == cudaGetLastError(), - "Tried to reset the expected cuda error code, but it didn't match the expectation"); - kernel_fast_available = false; - } - } - if (kernel_fast_available) { - int numBlocks_kernel_no_basediff = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks_kernel_no_basediff, kernel_no_basediff, numThreads, sizeSmem)); - - int numBlocks_kernel_fast = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks_kernel_fast, kernel_fast, numThreads, sizeSmem + sizeSmemBaseDiff)); - - // Use "kernel_fast" only if GPU occupancy does not drop - if (numBlocks_kernel_no_basediff == numBlocks_kernel_fast) { - kernel = kernel_fast; - sizeSmem += sizeSmemBaseDiff; - } - } - } - dim3 ctaThreads(numThreads, 1, 1); - dim3 ctaBlocks(numCTAs, 1, 1); - kernel<<>>(desc->numDataset, - desc->dimRotDataset, - desc->numProbes, - desc->dimPq, - numQueries, - desc->maxSamples, - desc->metric, - desc->typePqCenter, - desc->topK, - clusterCenters, - pqCenters, - pqDataset, - cluster_offsets, - clusterLabelsToProbe, - chunkIndexPtr, - query, - indexListSorted, - preCompScores, - topkScores, - (scoreDtype*)similarity, - simTopkIndex); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif - - // Select topk vectors for each query - if (simTopkIndex == NULL) { - _cuann_find_topk(handle, - 
desc->topK, - numQueries, - desc->maxSamples, - numSamples, - (scoreDtype*)similarity, - topkSids, - topkWorkspace); - } else { - _cuann_find_topk(handle, - desc->topK, - numQueries, - (desc->numProbes * desc->topK), - NULL, - (scoreDtype*)similarity, - topkSids, - topkWorkspace); - } -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif - - // - dim3 moThreads(128, 1, 1); - dim3 moBlocks((desc->topK + moThreads.x - 1) / moThreads.x, numQueries, 1); - ivfpq_make_outputs - <<>>(desc->numProbes, - desc->topK, - desc->maxSamples, - numQueries, - cluster_offsets, - originalNumbers, - clusterLabelsToProbe, - chunkIndexPtr, - (scoreDtype*)similarity, - simTopkIndex, - topkSids, - topkNeighbors, - topkDistances); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif +template +inline void search(const handle_t& handle, + const search_params& params, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr = nullptr) +{ + return raft::spatial::knn::ivf_pq::detail::search( + handle, params, index, queries, n_queries, k, neighbors, distances, mr); } } // namespace raft::spatial::knn::ivf_pq diff --git a/cpp/include/raft/spatial/knn/ivf_pq_types.hpp b/cpp/include/raft/spatial/knn/ivf_pq_types.hpp new file mode 100644 index 0000000000..f462eef3d2 --- /dev/null +++ b/cpp/include/raft/spatial/knn/ivf_pq_types.hpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" + +#include +#include +#include + +namespace raft::spatial::knn::ivf_pq { + +/** A type for specifying how PQ codebooks are created. */ +enum class codebook_gen { + PER_SUBSPACE = 0, + PER_CLUSTER = 1, +}; + +struct index_params : knn::index_params { + /** + * The number of inverted lists (clusters) + * + * Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to + * 10,000. + */ + uint32_t n_lists = 1024; + /** The number of iterations searching for kmeans centers (index building). */ + uint32_t kmeans_n_iters = 20; + /** The fraction of data to use during iterative kmeans building. */ + double kmeans_trainset_fraction = 0.5; + /** + * The bit length of the vector element after compression by PQ. + * + * Possible values: [4, 5, 6, 7, 8]. + * + * Hint: the smaller the 'pq_bits', the smaller the index size and the better the search + * performance, but the lower the recall. + */ + uint32_t pq_bits = 8; + /** + * The dimensionality of the vector after compression by PQ. When zero, an optimal value is + * selected using a heuristic. + * + * NB: `pq_dim * pq_bits` must be a multiple of 8. + * + * Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but + * lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiple of 8 are + * desirable for good performance. 
If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8. + * For good performance, multiple 32 is desirable. + */ + uint32_t pq_dim = 0; + /** + * If true, dataset and query vectors are rotated by a random rotation matrix created at indexing + * time. + * + * NB: Currently, the rotation matrix is generated on CPU and may measurably increase the indexing + * time. + */ + bool random_rotation = true; + /** How PQ codebooks are created. */ + codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE; +}; + +struct search_params : knn::search_params { + /** The number of clusters to search. */ + uint32_t n_probes = 20; + /** + * Data type of LUT to be created dynamically at search time. + * + * Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U] + * + * The use of low-precision types reduces the amount of shared memory required at search time, so + * fast shared memory kernels can be used even for datasets with large dimansionality. Note that + * the recall is slightly degraded when low-precision type is selected. + */ + cudaDataType_t smem_lut_dtype = CUDA_R_32F; + /** + * Storage data type for distance/similarity computed at search time. + * + * Possible values: [CUDA_R_16F, CUDA_R_32F] + * + * If the performance limiter at search time is device memory access, selecting FP16 will improve + * performance slightly. + */ + cudaDataType_t internal_distance_dtype = CUDA_R_32F; + /** + * Thread block size of the distance calculation kernel at search time. + * When zero, an optimal block size is selected using a heuristic. + * + * Possible values: [0, 256, 512, 1024] + */ + uint32_t preferred_thread_block_size = 0; +}; + +static_assert(std::is_aggregate_v); +static_assert(std::is_aggregate_v); + +namespace detail { + +/* IvfPq */ +struct cuannIvfPqDescriptor { + uint32_t numClusters; + uint32_t numDataset; + uint32_t dimDataset; + uint32_t dimDatasetExt; + uint32_t dimRotDataset; + uint32_t dimPq; + uint32_t bitPq; + distance::DistanceType metric; + codebook_gen typePqCenter; + cudaDataType_t dtypeDataset; + cudaDataType_t internalDistanceDtype; + cudaDataType_t smemLutDtype; + uint32_t indexVersion; + uint32_t maxClusterSize; + uint32_t lenPq; // dimRotDataset / dimPq + uint32_t numProbes; + uint32_t topK; + uint32_t maxQueries; + uint32_t maxBatchSize; + uint32_t maxSamples; + uint32_t* inclusiveSumSortedClusterSize; // [numClusters,] + float* sqsumClusters; // [numClusters,] + size_t sizeCubWorkspace; + uint32_t _numClustersSize0; // (*) urgent WA, need to be fixed + uint32_t preferredThreadBlockSize; + void* index_ptr; +}; +using cuannIvfPqDescriptor_t = + std::unique_ptr>; + +cuannIvfPqDescriptor_t cuannIvfPqCreateDescriptor() +{ + return cuannIvfPqDescriptor_t{[]() { + auto desc = new cuannIvfPqDescriptor{}; + desc->numClusters = 0; + desc->numDataset = 0; + desc->dimDataset = 0; + desc->dimDatasetExt = 0; + desc->dimRotDataset = 0; + desc->dimPq = 0; + desc->bitPq = 0; + desc->numProbes = 0; + desc->topK = 0; + desc->maxQueries = 0; + desc->maxBatchSize = 0; + desc->maxSamples = 0; + desc->inclusiveSumSortedClusterSize = nullptr; + desc->sqsumClusters = nullptr; + desc->index_ptr = nullptr; + return desc; + }(), + [](cuannIvfPqDescriptor* desc) { + if (desc->inclusiveSumSortedClusterSize != nullptr) { + free(desc->inclusiveSumSortedClusterSize); + } + if (desc->sqsumClusters != nullptr) { + RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->sqsumClusters)); + } + if (desc->index_ptr != nullptr) { + RAFT_CUDA_TRY_NO_THROW(cudaFree(desc->index_ptr)); + } + delete desc; + }}; +} + +} // namespace detail + 
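// The detail descriptor above is handed out as a std::unique_ptr with a custom deleter, so the
// host- and device-side allocations it owns are released together when the owning index is
// destroyed. A minimal sketch of the same ownership pattern with a toy resource and a
// function-pointer deleter (the exact deleter type used above is defined there; the names
// below are illustrative only):
struct toy_resource {
  float* device_buf = nullptr;
};
using toy_resource_ptr = std::unique_ptr<toy_resource, void (*)(toy_resource*)>;

inline toy_resource_ptr make_toy_resource()
{
  return toy_resource_ptr{new toy_resource{}, [](toy_resource* r) {
                            if (r->device_buf != nullptr) { cudaFree(r->device_buf); }
                            delete r;
                          }};
}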
+/** + * @brief IVF-PQ index. + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + */ +template +struct index : knn::index { + static_assert(!raft::is_narrowing_v, + "IdxT must be able to represent all values of uint32_t"); + + public: + /** Dimensionality of the data. */ + [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t { return dim_; } + /** Bit length of the encoded PQ vector element (see index_parameters). */ + [[nodiscard]] constexpr inline auto pq_dim() const noexcept -> uint32_t { return pq_dim_; } + /** Distance metric used for clustering. */ + [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType + { + return metric_; + } + /** Number of clusters/inverted lists. */ + [[nodiscard]] constexpr inline auto n_lists() const noexcept -> uint32_t { return n_lists_; } + + inline auto desc() noexcept -> detail::cuannIvfPqDescriptor_t& { return cuann_desc_; } + [[nodiscard]] inline auto desc() const noexcept -> const detail::cuannIvfPqDescriptor_t& + { + return cuann_desc_; + } + + // Don't allow copying the index for performance reasons (try avoiding copying data) + index(const index&) = delete; + index(index&&) = default; + auto operator=(const index&) -> index& = delete; + auto operator=(index&&) -> index& = default; + ~index() = default; + + /** Construct an empty index. It needs to be trained and then populated. */ + index(const handle_t& handle, + raft::distance::DistanceType metric, + uint32_t n_lists, + uint32_t dim, + uint32_t pq_dim = 0) + : knn::index(), + n_lists_(n_lists), + metric_(metric), + dim_(dim), + pq_dim_(pq_dim == 0 ? calculate_pq_dim(dim) : pq_dim), + cuann_desc_{detail::cuannIvfPqCreateDescriptor()} + { + check_consistency(); + } + + private: + raft::distance::DistanceType metric_; + uint32_t n_lists_; + uint32_t dim_; + uint32_t pq_dim_; + detail::cuannIvfPqDescriptor_t cuann_desc_; + + /** Throw an error if the index content is inconsistent. */ + void check_consistency() {} + + static inline auto calculate_pq_dim(uint32_t dim) -> uint32_t + { + // If the dimensionality is large enough, we can reduce it to improve performance + if (dim >= 128) { dim /= 2; } + // Round it up to 32 to improve performance. + uint32_t r = raft::alignDown(dim, 32); + if (r > 0) return r; + // If the dimensionality is really low, round it to the closest power-of-two + r = 1; + while ((r << 1) <= dim) { + r = r << 1; + } + return r; + } +}; + +} // namespace raft::spatial::knn::ivf_pq diff --git a/cpp/test/spatial/ann_ivf_pq.cu b/cpp/test/spatial/ann_ivf_pq.cu index 17fafa2b30..11a63cf297 100644 --- a/cpp/test/spatial/ann_ivf_pq.cu +++ b/cpp/test/spatial/ann_ivf_pq.cu @@ -157,123 +157,36 @@ class IvfPqTest : public ::testing::TestWithParam { rmm::device_uvector indices_ivf_pq_dev(queries_size, stream_); { - auto size_1 = uint32_t(ps.num_db_vecs) / 2; - auto size_2 = uint32_t(ps.num_db_vecs) - size_1; + auto size_1 = uint64_t(ps.num_db_vecs) / 2; + auto size_2 = uint64_t(ps.num_db_vecs) - size_1; auto vecs_1 = database.data(); auto vecs_2 = database.data() + size_t(size_1) * size_t(ps.dim); + rmm::device_uvector db_indices(ps.num_db_vecs, stream_); + sparse::iota_fill(db_indices.data(), uint64_t(ps.num_db_vecs), uint64_t(1), stream_); + handle_.sync_stream(stream_); - auto cuann_desc_1 = ivf_pq::cuannIvfPqCreateDescriptor(); - - // Number of kmeans clusters. - // - // The number of vectors per cluster, or 'numDataset' / 'numClusters', - // should be approximately 1,000 to 10,000. 
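// For reference, the calculate_pq_dim heuristic in the index constructor above reduces to:
// halve the dimensionality when it is 128 or larger, round the result down to a multiple of
// 32, and fall back to the largest power of two for very small inputs. A standalone
// restatement with a few worked values (assuming raft::alignDown(x, 32) == x - x % 32; the
// function name below is illustrative):
inline uint32_t default_pq_dim(uint32_t dim)
{
  if (dim >= 128) { dim /= 2; }    // large vectors are compressed more aggressively
  uint32_t r = dim - (dim % 32);   // round down to a multiple of 32
  if (r > 0) { return r; }
  r = 1;                           // tiny vectors: use the largest power of two <= dim
  while ((r << 1) <= dim) { r <<= 1; }
  return r;
}
// default_pq_dim(960) == 480, default_pq_dim(100) == 96,
// default_pq_dim(50) == 32, default_pq_dim(20) == 16.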
- uint32_t n_clusters = ps.nlist; - // Important parameters of the index to create. - // - // 'bitPq' is the bit length of the vector element after compression by PQ. - // 'dimPq' is the dimensionality of the vector after compression by PQ. - // - // 'bitPq' is 4, 5, 6, 7, or 8. The smaller the 'bitPq', the smaller the - // index size and the better the search performance, but the lower the recall. - // - // Similarly, a smaller 'dimPq' results in a smaller index size and better - // search performance, but lower recall. If 'bitPq' is 8, 'dimPq' can be set - // to any number, but multiple of 8 are desirable for good performance. - // If 'bitPq' is not 8, 'dimPq' must be basically multiple of 8. For good - // performance, multiple 32 is desirable. - // - uint32_t bitPq = 8; - uint32_t dimPq = ps.dim; - if (dimPq >= 128) { - dimPq = raft::alignDown(dimPq / 2, 32); - } else if (dimPq >= 32) { - dimPq = raft::alignDown(dimPq, 32); - } else if (dimPq >= 8) { - dimPq = raft::alignDown(dimPq, 8); - } - // If true, dataset and query vectors are rotated by random rotation matrix - // created at indexing time. - // - bool randomRotation = ps.dim < 1024; // disable for large-dimensional data (CPU intensive) - // Number of iterations for kmeans training. - uint32_t numIterations = 25; - // Specify whether PQ codebooks are created per subspace or per cluster. - ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_SUBSPACE; - // ivf_pq::cuannPqCenter_t typePqCenter = ivf_pq::CUANN_PQ_CENTER_PER_CLUSTER; - ivf_pq::cuannIvfPqSetIndexParameters( - cuann_desc_1, - n_clusters, /* Number of clusters */ - size_1, /* Number of dataset entries */ - uint32_t(ps.dim), /* Dimension of each entry */ - dimPq, /* Dimension of each entry after product quantization */ - bitPq, /* Bit length of PQ */ - ps.metric, - typePqCenter); - - // Build index - ivf_pq::cuannIvfPqBuildIndex( - handle_, - cuann_desc_1, - vecs_1, // dataset - database.data(), // ?kmeans? trainset - uint32_t(ps.num_db_vecs), // size of the trainset (I guess for kmeans) - numIterations, - randomRotation, - true // hierarchialClustering: always true in raft - ); + raft::spatial::knn::ivf_pq::index_params index_params; + raft::spatial::knn::ivf_pq::search_params search_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + search_params.n_probes = ps.nprobe; + + auto index = ivf_pq::build(handle_, index_params, vecs_1, size_1, ps.dim); handle_.sync_stream(stream_); - auto cuann_desc_2 = ivf_pq::cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( - handle_, cuann_desc_1, vecs_2, size_2); - - // set search parameters - ivf_pq::cuannIvfPqSetSearchParameters(cuann_desc_2, ps.nprobe, ps.k); - // Data type of LUT to be created dynamically at search time. - // - // The use of low-precision types reduces the amount of shared memory - // required at search time, so fast shared memory kernels can be used even - // for datasets with large dimansionality. Note that the recall is slightly - // degraded when low-precision type is selected. - // - cudaDataType_t smemLutDtype = CUDA_R_32F; - // smemLutDtype = CUDA_R_16F; - // smemLutDtype = CUDA_R_8U; - // Storage data type for distance/similarity computed at search time. - // - // If the performance limiter at search time is device memory access, - // selecting FP16 will improve performance slightly. - // - cudaDataType_t internalDistanceDtype = CUDA_R_32F; - // internalDistanceDtype = CUDA_R_16F; - - // Thread block size of the distance calculation kernel at search time. 
- // - // If 0, the thread block size is determined automatically. - // - uint32_t preferredThreadBlockSize = 0; // 0, 256, 512, or 1024 - ivf_pq::cuannIvfPqSetSearchTuningParameters( - cuann_desc_2, internalDistanceDtype, smemLutDtype, preferredThreadBlockSize); - // Maximum number of query vectors to search at the same time. - uint32_t batchSize = std::min(ps.num_queries, 32768); - // Maximum device memory size that may be used as workspace at search time. - // maxSearchWorkspaceSize = 0; // default - size_t maxSearchWorkspaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GiB - - // Allocate memory for index - size_t ivf_pq_search_workspace_size; - ivf_pq::cuannIvfPqSearch_bufferSize( - handle_, cuann_desc_2, batchSize, maxSearchWorkspaceSize, &ivf_pq_search_workspace_size); - rmm::device_buffer ivf_pq_search_ws_buf(ivf_pq_search_workspace_size, stream_); + auto index_2 = ivf_pq::extend( + handle_, index, vecs_2, db_indices.data() + size_1, size_2); + handle_.sync_stream(stream_); // finally, search! - cuannIvfPqSearch(handle_, - cuann_desc_2, - search_queries.data(), - ps.num_queries, - indices_ivf_pq_dev.data(), - distances_ivf_pq_dev.data(), - ivf_pq_search_ws_buf.data()); + ivf_pq::search(handle_, + search_params, + index_2, + search_queries.data(), + ps.num_queries, + ps.k, + indices_ivf_pq_dev.data(), + distances_ivf_pq_dev.data()); handle_.sync_stream(stream_); update_host(distances_ivf_pq.data(), distances_ivf_pq_dev.data(), queries_size, stream_); From a74ed5d9b9b3cae0afa40bdc8303146d822414c9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 16 Aug 2022 13:02:25 +0200 Subject: [PATCH 020/140] Cleanup unused functions --- .../raft/spatial/knn/detail/ivf_pq_legacy.cuh | 407 ------------------ 1 file changed, 407 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh index d46e3d3443..cdec1779f9 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh @@ -857,130 +857,6 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, _cuann_multi_device_free((uint8_t**)predictWorkspaceMP, numDevices); } -// predict labe of dataset (naive CPU version). -// (*) available only for prediction, but not for training. 
-inline void _cuann_kmeans_predict_CPU(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* dataset, // [numDataset, dimCenters] - cudaDataType_t dtype, - uint32_t numDataset, - uint32_t* labels, // [numDataset] - distance::DistanceType metric) -{ - float multiplier = 1.0; - if (dtype == CUDA_R_8U) { - multiplier = 1.0 / 256.0; - } else if (dtype == CUDA_R_8I) { - multiplier = 1.0 / 128.0; - } - for (uint32_t i = 0; i < numDataset; i++) { - float* vector = (float*)malloc(sizeof(float) * dimCenters); - for (uint32_t j = 0; j < dimCenters; j++) { - if (dtype == CUDA_R_32F) { - vector[j] = ((float*)dataset)[j + (dimCenters * i)]; - } else if (dtype == CUDA_R_8U) { - vector[j] = ((uint8_t*)dataset)[j + (dimCenters * i)]; - vector[j] *= multiplier; - } else if (dtype == CUDA_R_8I) { - vector[j] = ((int8_t*)dataset)[j + (dimCenters * i)]; - vector[j] *= multiplier; - } - } - float best_score; - for (uint32_t l = 0; l < numCenters; l++) { - float score = 0.0; - for (uint32_t j = 0; j < dimCenters; j++) { - if (metric == distance::DistanceType::InnerProduct) { - score -= vector[j] * centers[j + (dimCenters * l)]; - } else { - float diff = vector[j] - centers[j + (dimCenters * l)]; - score += diff * diff; - } - } - if ((l == 0) || (score < best_score)) { - labels[i] = l; - best_score = score; - } - } - free(vector); - } -} - -#define R_FACTOR 8 - -// -template -__global__ void kern_adjust_centers(float* centers, // [numCenters, dimCenters] - uint32_t numCenters, - uint32_t dimCenters, - const void* _dataset, // [numDataet, dimCenters] - uint32_t numDataset, - const uint32_t* labels, // [numDataset] - distance::DistanceType metric, - const uint32_t* clusterSize, // [numCenters] - float threshold, - uint32_t average, - uint32_t ofst, - uint32_t* count) -{ - const T* dataset = (const T*)_dataset; - float divisor = (float)_divisor; - uint32_t l = threadIdx.y + blockDim.y * blockIdx.y; - if (l >= numCenters) return; - if (clusterSize[l] > (int)(average * threshold)) return; - - uint32_t laneId = threadIdx.x % 32; - uint32_t i; - if (laneId == 0) { - do { - uint32_t old = atomicAdd(count, 1); - i = (ofst * (old + 1)) % numDataset; - } while (clusterSize[labels[i]] < average); - } - i = __shfl_sync(0xffffffff, i, 0); - uint32_t li = labels[i]; - float sqsum = 0.0; - for (uint32_t j = laneId; j < dimCenters; j += 32) { - float val = centers[j + (uint64_t)dimCenters * li] * (R_FACTOR - 1); - val += (float)(dataset[j + (uint64_t)dimCenters * i]) / divisor; - val /= R_FACTOR; - sqsum += val * val; - centers[j + (uint64_t)dimCenters * l] = val; - } - if (metric == distance::DistanceType::InnerProduct) { - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); - sqsum = sqrt(sqsum); - for (uint32_t j = laneId; j < dimCenters; j += 32) { - centers[j + ((uint64_t)dimCenters * l)] /= sqsum; - } - } -} - -/** - * end of kmeans - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * Start of topk - */ - // #define NUM_THREADS 1024 // DO NOT CHANGE #define STATE_BIT_LENGTH 8 // 0: state not used, 8: state used @@ -2203,25 +2079,6 @@ inline void _cuann_find_topk(const handle_t& handle, } } -/** - * - * End of topk - * - * - * - * - * - * - * - * - * - * - * Start of ivfpq - */ - -// -inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& 
desc); - // search template inline void ivfpq_search(const handle_t& handle, @@ -2238,54 +2095,6 @@ inline void ivfpq_search(const handle_t& handle, float* topKDistances, // [topK] void* workspace); -inline void ivfpq_encode(uint32_t numDataset, - uint32_t ldDataset, // (*) ldDataset >= numDataset - uint32_t dimPq, - uint32_t bitPq, // 4 <= bitPq <= 8 - const uint32_t* label, // [dimPq, ldDataset] - uint8_t* output // [numDataset, dimPq] -); - -// -bool manage_local_topk(cuannIvfPqDescriptor_t& desc); -inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads); - -// -__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] - float initValue, - uint32_t num); - -// -__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList); - -// -__global__ void ivfpq_make_chunk_index_ptr( - uint32_t numProbes, - uint32_t sizeBatch, - const uint32_t* cluster_offsets, // [numClusters + 1,] - const uint32_t* _clusterLabelsToProbe, // [sizeBatch, numProbes,] - uint32_t* _chunkIndexPtr, // [sizeBetch, numProbes,] - uint32_t* numSamples // [sizeBatch,] -); - -// -template -__global__ void ivfpq_make_outputs(uint32_t numProbes, - uint32_t topk, - uint32_t maxSamples, - uint32_t sizeBatch, - const uint32_t* clusterIndexPtr, // [numClusters + 1] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterLabels, // [sizeBatch, numProbes] - const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] - const scoreDtype* scores, // [sizeBatch, maxSamples] or - // [sizeBatch, numProbes, topk] - const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] - const uint32_t* topkSampleIds, // [sizeBatch, topk] - uint64_t* topkNeighbors, // [sizeBatch, topk] - float* topkScores // [sizeBatch, topk] -); - // __device__ inline uint32_t warp_scan(uint32_t x) { @@ -2658,69 +2467,6 @@ inline void ivfpq_encode(uint32_t numDataset, #endif } -// -template __global__ void ivfpq_make_outputs( - uint32_t numProbes, - uint32_t topk, - uint32_t maxSamples, - uint32_t sizeBatch, - const uint32_t* clusterIndexPtr, // [numClusters + 1] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterLabels, // [sizeBatch, numProbes] - const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] - const float* scores, // [sizeBatch, maxSamples] or - // [sizeBatch, numProbes, topk] - const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] - const uint32_t* topkSampleIds, // [sizeBatch, topk] - uint64_t* topkNeighbors, // [sizeBatch, topk] - float* topkScores // [sizeBatch, topk] -); - -// -template __global__ void ivfpq_make_outputs( - uint32_t numProbes, - uint32_t topk, - uint32_t maxSamples, - uint32_t sizeBatch, - const uint32_t* clusterIndexPtr, // [numClusters + 1] - const uint32_t* originalNumbers, // [numDataset] - const uint32_t* clusterLabels, // [sizeBatch, numProbes] - const uint32_t* chunkIndexPtr, // [sizeBatch, numProbes] - const half* scores, // [sizeBatch, maxSamples] or - // [sizeBatch, numProbes, topk] - const uint32_t* scoreTopkIndex, // [sizeBatch, numProbes, topk] - const uint32_t* topkSampleIds, // [sizeBatch, topk] - uint64_t* topkNeighbors, // [sizeBatch, topk] - float* topkScores // [sizeBatch, topk] -); - -/** - * End of ivfpq - * - * - * - * - */ - -inline void cuannIvfPqSetIndexParameters( - cuannIvfPqDescriptor_t& desc, - const uint32_t numClusters, /* Number of clusters */ - const uint32_t numDataset, /* Number of dataset entries */ - const uint32_t dimDataset, /* Dimension of each entry */ - const uint32_t dimPq, /* Dimension 
of each entry after product quantization */ - const uint32_t bitPq, /* Bit length of PQ */ - const distance::DistanceType metric, - const codebook_gen typePqCenter); - -inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - distance::DistanceType* metric, - codebook_gen* typePqCenter); - inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, size_t* size /* bytes of dataset index */); @@ -2796,16 +2542,6 @@ inline void _cuann_get_index_pointers(cuannIvfPqDescriptor_t& desc, (float*)((uint8_t*)(*rotationMatrix) + _cuann_getIndexSize_rotationMatrix(desc)); } -__global__ void kern_get_cluster_size(uint32_t numClusters, - const uint32_t* cluster_offsets, // [numClusters + 1,] - uint32_t* clusterSize // [numClusters,] -) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= numClusters) return; - clusterSize[i] = cluster_offsets[i + 1] - cluster_offsets[i]; -} - template int descending(const void* a, const void* b) { @@ -3303,7 +3039,6 @@ void _cuann_compute_PQ_code(const handle_t& handle, } } -// cuannIvfPqSetIndexParameters inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t& desc, const uint32_t numClusters, const uint32_t numDataset, @@ -3350,27 +3085,6 @@ inline void cuannIvfPqSetIndexParameters(cuannIvfPqDescriptor_t& desc, desc->lenPq = desc->dimRotDataset / dimPq; } -// cuannIvfPqGetIndexParameters -inline void cuannIvfPqGetIndexParameters(cuannIvfPqDescriptor_t& desc, - uint32_t* numClusters, - uint32_t* numDataset, - uint32_t* dimDataset, - uint32_t* dimPq, - uint32_t* bitPq, - distance::DistanceType* metric, - codebook_gen* typePqCenter) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - - *numClusters = desc->numClusters; - *numDataset = desc->numDataset; - *dimDataset = desc->dimDataset; - *dimPq = desc->dimPq; - *bitPq = desc->bitPq; - *metric = desc->metric; - *typePqCenter = desc->typePqCenter; -} - // cuannIvfPqGetIndexSize inline void cuannIvfPqGetIndexSize(cuannIvfPqDescriptor_t& desc, size_t* size) { @@ -4079,127 +3793,6 @@ void cuannIvfPqBuildIndex( _cuann_set_device(callerDevId); } -// cuannIvfPqSaveIndex -inline void cuannIvfPqSaveIndex(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - const char* fileName) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - int orgDevId = _cuann_set_device(handle.get_device()); - - FILE* fp = fopen(fileName, "w"); - RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); - - struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); - RAFT_LOG_DEBUG("indexSize: %lu\n", header->indexSize); - if (fwrite(desc->index_ptr, 1, header->indexSize, fp) != header->indexSize) { - RAFT_FAIL("(%s) failed to save index to file (%s)\n", __func__, fileName); - } - fclose(fp); - - _cuann_set_device(orgDevId); -} - -// cuannIvfPqLoadIndex -inline void cuannIvfPqLoadIndex(const handle_t& handle, - cuannIvfPqDescriptor_t& desc, - const char* fileName) -{ - RAFT_EXPECTS(desc != nullptr, "the descriptor is not initialized."); - int orgDevId = _cuann_set_device(handle.get_device()); - - if (1 /* *index == NULL */) { - FILE* fp = fopen(fileName, "r"); - RAFT_EXPECTS(fp != nullptr, "(%s) failed to open file (%s).", __func__, fileName); - - if (desc->index_ptr != NULL) { RAFT_CUDA_TRY(cudaFree(desc->index_ptr)); } - size_t indexSize; - fread(&indexSize, sizeof(size_t), 1, fp); - 
RAFT_LOG_DEBUG("indexSize: %lu\n", indexSize); - RAFT_CUDA_TRY(cudaMallocManaged(&(desc->index_ptr), indexSize)); - fseek(fp, 0, SEEK_SET); - if (fread(desc->index_ptr, 1, indexSize, fp) != indexSize) { - RAFT_FAIL("(%s) failed to load index to from file (%s)\n", __func__, fileName); - } - fclose(fp); - - RAFT_CUDA_TRY( - cudaMemAdvise(desc->index_ptr, indexSize, cudaMemAdviseSetReadMostly, handle.get_device())); - } - - struct cuannIvfPqIndexHeader* header = (struct cuannIvfPqIndexHeader*)(desc->index_ptr); - desc->numClusters = header->numClusters; - desc->numDataset = header->numDataset; - desc->dimDataset = header->dimDataset; - desc->dimPq = header->dimPq; - desc->metric = (distance::DistanceType)header->metric; - desc->maxClusterSize = header->maxClusterSize; - desc->dimRotDataset = header->dimRotDataset; - desc->lenPq = desc->dimRotDataset / desc->dimPq; - desc->bitPq = header->bitPq; - desc->typePqCenter = (codebook_gen)header->typePqCenter; - desc->dtypeDataset = (cudaDataType_t)header->dtypeDataset; - desc->dimDatasetExt = header->dimDatasetExt; - desc->indexVersion = header->version; - - float* clusterCenters; // [numClusters, dimDatasetExt] - float* pqCenters; // [dimPq, 1 << bitPq, lenPq], or - // [numClusters, 1 << bitPq, lenPq] - uint8_t* pqDataset; // [numDataset, dimPq * bitPq / 8] - uint32_t* originalNumbers; // [numDataset] - uint32_t* cluster_offsets; // [numClusters + 1] - float* rotationMatrix; // [dimDataset, dimRotDataset] - float* clusterRotCenters; // [numClusters, dimRotDataset] - _cuann_get_index_pointers(desc, - &header, - &clusterCenters, - &pqCenters, - &pqDataset, - &originalNumbers, - &cluster_offsets, - &rotationMatrix, - &clusterRotCenters); - - // - _cuann_get_inclusiveSumSortedClusterSize( - desc, cluster_offsets, clusterCenters, &(desc->inclusiveSumSortedClusterSize)); - - size_t size; - // pqDataset - size = sizeof(uint8_t) * desc->numDataset * desc->dimPq * desc->bitPq / 8; - if (size < handle.get_device_properties().totalGlobalMem) { - RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqDataset, size, handle.get_device())); - } - // clusterCenters - size = sizeof(float) * desc->numClusters * desc->dimDatasetExt; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterCenters, size, handle.get_device())); - // pqCenters - if (desc->typePqCenter == codebook_gen::PER_SUBSPACE) { - size = sizeof(float) * desc->dimPq * (1 << desc->bitPq) * desc->lenPq; - } else { - size = sizeof(float) * desc->numClusters * (1 << desc->bitPq) * desc->lenPq; - } - RAFT_CUDA_TRY(cudaMemPrefetchAsync(pqCenters, size, handle.get_device())); - // originalNumbers - size = sizeof(uint32_t) * desc->numDataset; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(originalNumbers, size, handle.get_device())); - // cluster_offsets - size = sizeof(uint32_t) * (desc->numClusters + 1); - RAFT_CUDA_TRY(cudaMemPrefetchAsync(cluster_offsets, size, handle.get_device())); - // rotationMatrix - if (rotationMatrix != NULL) { - size = sizeof(float) * desc->dimDataset * desc->dimRotDataset; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(rotationMatrix, size, handle.get_device())); - } - // clusterRotCenters - if (clusterRotCenters != NULL) { - size = sizeof(float) * desc->numClusters * desc->dimRotDataset; - RAFT_CUDA_TRY(cudaMemPrefetchAsync(clusterRotCenters, size, handle.get_device())); - } - - _cuann_set_device(orgDevId); -} - template auto cuannIvfPqCreateNewIndexByAddingVectorsToOldIndex( const handle_t& handle, From f75a5b5f82a8903c4475978d9aa927a7a375a450 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 17 Aug 2022 13:25:33 +0200 
Subject: [PATCH 021/140] Removed a couple of helpers and added a few WIP comments --- .../raft/spatial/knn/detail/ivf_pq_legacy.cuh | 143 +++++++----------- 1 file changed, 56 insertions(+), 87 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh index cdec1779f9..7f4cae4126 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh @@ -18,6 +18,7 @@ #include "../ivf_pq_types.hpp" #include "ann_kmeans_balanced.cuh" #include "ann_utils.cuh" +#include "topk/warpsort_topk.cuh" #include #include @@ -30,6 +31,8 @@ #include #include +#include + /////////////////// #include #include @@ -260,34 +263,6 @@ inline void _cuann_copy_fill(uint32_t nRows, <<>>(nRows, nCols, src, ldSrc, dst, ldDst, fillValue, divisor); } -template void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const float* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float fillValue, - float divisor, - cudaStream_t stream); -template void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const uint8_t* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float fillValue, - float divisor, - cudaStream_t stream); -template void _cuann_copy_fill(uint32_t nRows, - uint32_t nCols, - const int8_t* src, - uint32_t ldSrc, - float* dst, - uint32_t ldDst, - float fillValue, - float divisor, - cudaStream_t stream); - // a -= b __global__ void kern_a_me_b(uint32_t nRows, uint32_t nCols, @@ -2165,24 +2140,6 @@ __global__ void ivfpq_make_chunk_index_ptr( } } -// -__global__ void ivfpq_init_topkScores(float* topkScores, // [num,] - float initValue, - uint32_t num) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= num) return; - topkScores[i] = initValue; -} - -// -__global__ void ivfpq_prep_sort(uint32_t numElement, uint32_t* indexList) -{ - uint32_t i = threadIdx.x + (blockDim.x * blockIdx.x); - if (i >= numElement) return; - indexList[i] = i; -} - // __device__ inline void ivfpq_get_id_dataset(uint32_t iSample, uint32_t numProbes, @@ -4670,10 +4627,18 @@ __device__ inline uint32_t max_value_of() return ~0u; } -// +// depth == kMaxArrLen = Capacity / kWarpWidth; (kWarpWidth fixed to 32) template class BlockTopk { public: + /** + * @param topk - k, must not be greater than depth * 32 + * @param ptr_kth_key - initial value for the k-th element; it's never updated here and + * seems to be needed only for optimization. It's read multiple times though. Is it updated + * elsewhere? Perhaps, only when other blocks in the grid finish their execution. + * Also, this parameter is optional; when null, the value is simply not used. + * + */ __device__ BlockTopk(uint32_t topk, K* ptr_kth_key) : _topk(topk), _lane_id(threadIdx.x % 32) { #pragma unroll @@ -4923,27 +4888,34 @@ class BlockTopk { } }; -// +/** + * Update the scores stored in device memory from the per-thread + * scores obtained by BlockTopk. + * + * TODO: global_output is not volatile and is accessed using both atomics and plain reads...
+ * + * @param topk + * @param my_score - small per-thread buffer containing scores for this thread + * @param global_output - global scores + */ template -__device__ inline void update_approx_global_score(uint32_t topk, - K* my_score, - K* approx_global_score) +__device__ inline void update_approx_global_score(uint32_t topk, K* my_score, K* global_output) { - if (!__any_sync(0xffffffff, (my_score[0] < approx_global_score[topk - 1]))) { return; } + if (!__any_sync(0xffffffff, (my_score[0] < global_output[topk - 1]))) { return; } if (topk <= 32) { K score = max_value_of(); - if (threadIdx.x < topk) { score = approx_global_score[threadIdx.x]; } + if (threadIdx.x < topk) { score = global_output[threadIdx.x]; } warp_sort(score, false); swap_if_needed(my_score[0], score); warp_merge(my_score[0]); - if (threadIdx.x < topk) { atomicMin(approx_global_score + threadIdx.x, my_score[0]); } + if (threadIdx.x < topk) { atomicMin(global_output + threadIdx.x, my_score[0]); } } else if (topk <= 64) { K score = max_value_of(); - if (threadIdx.x + 32 < topk) { score = approx_global_score[threadIdx.x + 32]; } + if (threadIdx.x + 32 < topk) { score = global_output[threadIdx.x + 32]; } warp_sort(score, false); swap_if_needed(my_score[0], score); - score = approx_global_score[threadIdx.x]; + score = global_output[threadIdx.x]; warp_sort(score, false); swap_if_needed(my_score[1], score); @@ -4951,17 +4923,17 @@ __device__ inline void update_approx_global_score(uint32_t topk, warp_merge(my_score[1]); warp_merge(my_score[0]); - atomicMin(approx_global_score + threadIdx.x, my_score[0]); - if (threadIdx.x + 32 < topk) { atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); } + atomicMin(global_output + threadIdx.x, my_score[0]); + if (threadIdx.x + 32 < topk) { atomicMin(global_output + threadIdx.x + 32, my_score[1]); } } else if (topk <= 96) { K score = max_value_of(); - if (threadIdx.x + 64 < topk) { score = approx_global_score[threadIdx.x + 64]; } + if (threadIdx.x + 64 < topk) { score = global_output[threadIdx.x + 64]; } warp_sort(score, false); swap_if_needed(my_score[1], score); - score = approx_global_score[threadIdx.x + 32]; + score = global_output[threadIdx.x + 32]; warp_sort(score, false); swap_if_needed(my_score[2], score); - score = approx_global_score[threadIdx.x]; + score = global_output[threadIdx.x]; warp_sort(score, false); K my_score_3_ = score; @@ -4973,21 +4945,21 @@ __device__ inline void update_approx_global_score(uint32_t topk, warp_merge(my_score[1]); warp_merge(my_score[0]); - atomicMin(approx_global_score + threadIdx.x, my_score[0]); - atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); - if (threadIdx.x + 64 < topk) { atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); } + atomicMin(global_output + threadIdx.x, my_score[0]); + atomicMin(global_output + threadIdx.x + 32, my_score[1]); + if (threadIdx.x + 64 < topk) { atomicMin(global_output + threadIdx.x + 64, my_score[2]); } } else if (topk <= 128) { K score = max_value_of(); - if (threadIdx.x + 96 < topk) { score = approx_global_score[threadIdx.x + 96]; } + if (threadIdx.x + 96 < topk) { score = global_output[threadIdx.x + 96]; } warp_sort(score, false); swap_if_needed(my_score[0], score); - score = approx_global_score[threadIdx.x + 64]; + score = global_output[threadIdx.x + 64]; warp_sort(score, false); swap_if_needed(my_score[1], score); - score = approx_global_score[threadIdx.x + 32]; + score = global_output[threadIdx.x + 32]; warp_sort(score, false); swap_if_needed(my_score[2], score); - score = 
approx_global_score[threadIdx.x]; + score = global_output[threadIdx.x]; warp_sort(score, false); swap_if_needed(my_score[3], score); @@ -5000,10 +4972,10 @@ __device__ inline void update_approx_global_score(uint32_t topk, warp_merge(my_score[1]); warp_merge(my_score[0]); - atomicMin(approx_global_score + threadIdx.x, my_score[0]); - atomicMin(approx_global_score + threadIdx.x + 32, my_score[1]); - atomicMin(approx_global_score + threadIdx.x + 64, my_score[2]); - if (threadIdx.x + 96 < topk) { atomicMin(approx_global_score + threadIdx.x + 96, my_score[3]); } + atomicMin(global_output + threadIdx.x, my_score[0]); + atomicMin(global_output + threadIdx.x + 32, my_score[1]); + atomicMin(global_output + threadIdx.x + 64, my_score[2]); + if (threadIdx.x + 96 < topk) { atomicMin(global_output + threadIdx.x + 96, my_score[3]); } } } @@ -5138,6 +5110,8 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } uint32_t iDatasetBase = clusterIndexPtr[label]; + // topk::block_sort + // block_topk(topk, ___); BlockTopk block_topk( topk, manageLocalTopk ? approx_global_score + topk - 1 : NULL); __syncthreads(); @@ -5158,6 +5132,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( } if (!manageLocalTopk) { return; } block_topk.finalize(); + // block_topk.done(); // Output topk score and index uint32_t warp_id = threadIdx.x / 32; @@ -5369,7 +5344,8 @@ inline void ivfpq_search(const handle_t& handle, scoreDtype* similarity; // [maxBatchSize, maxSamples] or // [maxBatchSize, numProbes, topk] uint32_t* simTopkIndex; // [maxBatchSize, numProbes, topk] - float* topkScores; // [maxBatchSize, topk] + /* Preset with the dummy value and only accessed within the main kernel. */ + float* topkScores; // [maxBatchSize, topk] float* preCompScores = NULL; void* topkWorkspace; @@ -5414,13 +5390,10 @@ inline void ivfpq_search(const handle_t& handle, // if (manage_local_topk(desc)) { - dim3 iksThreads(128, 1, 1); - dim3 iksBlocks(((numQueries * desc->topK) + iksThreads.x - 1) / iksThreads.x, 1, 1); - ivfpq_init_topkScores<<>>( - topkScores, FLT_MAX, numQueries * desc->topK); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif + thrust::fill_n(handle.get_thrust_policy(), + thrust::device_pointer_cast(topkScores), + numQueries * desc->topK, + FLT_MAX); } // @@ -5437,13 +5410,9 @@ inline void ivfpq_search(const handle_t& handle, // The goal is to incrase the L2 cache hit rate to read the vectors // of a cluster by processing the cluster at the same time as much as // possible. 
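// Illustration only (not part of this patch): the comment above explains that the probe list is
// reordered so that probes hitting the same cluster are processed close together, which improves
// the L2 hit rate when the cluster's PQ codes are read. A minimal sketch of that pattern follows;
// the function and buffer names (sort_probes_by_cluster, labels_in, labels_sorted, probe_ids,
// probe_ids_sorted, workspace) are assumed for the example. The surrounding code does the same
// thing with thrust::sequence for the identity permutation and a radix sort of the cluster labels
// over the bit range [begin_bit, end_bit) seen just below.
#include <cstdint>
#include <cub/cub.cuh>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>

inline void sort_probes_by_cluster(uint32_t n_probes_total,
                                   const uint32_t* labels_in,   // [n] cluster label of each (query, probe) pair
                                   uint32_t* labels_sorted,     // [n] labels after sorting
                                   uint32_t* probe_ids,         // [n] scratch, receives 0..n-1
                                   uint32_t* probe_ids_sorted,  // [n] probe order used by the main kernel
                                   void* workspace,             // sized beforehand via SortPairs(nullptr, ...)
                                   size_t workspace_size,
                                   cudaStream_t stream)
{
  // Identity permutation: probe_ids[i] = i.
  thrust::sequence(thrust::cuda::par.on(stream), probe_ids, probe_ids + n_probes_total);
  // Reorder the probe ids keyed by cluster label, so that equal labels become contiguous
  // and neighbouring thread blocks reuse the same cluster's data from the L2 cache.
  cub::DeviceRadixSort::SortPairs(workspace,
                                  workspace_size,
                                  labels_in,
                                  labels_sorted,
                                  probe_ids,
                                  probe_ids_sorted,
                                  n_probes_total,
                                  0,
                                  sizeof(uint32_t) * 8,
                                  stream);
}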
- dim3 psThreads(128, 1, 1); - dim3 psBlocks((numQueries * desc->numProbes + psThreads.x - 1) / psThreads.x, 1, 1); - ivfpq_prep_sort<<>>(numQueries * desc->numProbes, - indexList); -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) - handle.sync_stream(); -#endif + thrust::sequence(handle.get_thrust_policy(), + thrust::device_pointer_cast(indexList), + thrust::device_pointer_cast(indexList + numQueries * desc->numProbes)); int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; From f8ea503cdba8af415267dce31062248556560fee Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 17 Aug 2022 15:16:59 +0200 Subject: [PATCH 022/140] Fixing small warnings to make Release config compilable --- .../raft/spatial/knn/detail/ivf_pq_legacy.cuh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh index 7f4cae4126..a273a07d38 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh @@ -771,7 +771,7 @@ inline void _cuann_kmeans_predict_MP(const handle_t& handle, uint64_t d0 = (uint64_t)numDataset * (devId) / numDevices; uint64_t d1 = (uint64_t)numDataset * (devId + 1) / numDevices; uint64_t nDataset = d1 - d0; - void* ptrDataset; + void* ptrDataset = nullptr; if (dtype == CUDA_R_32F) { ptrDataset = (void*)((float*)dataset + (uint64_t)dimCenters * d0); } else if (dtype == CUDA_R_8U) { @@ -1902,6 +1902,8 @@ inline void _cuann_find_topk(const handle_t& handle, cta_kernel = kern_topk_cta_11; } else if (vecLen == 1) { cta_kernel = kern_topk_cta_11; + } else { + RAFT_FAIL("Unexpected vecLen (%d)", vecLen); } cta_kernel<<>>( topK, sizeBatch, maxSamples, numSamples, (const uint32_t*)samples, state, labels); @@ -2035,6 +2037,8 @@ inline void _cuann_find_topk(const handle_t& handle, cta_kernel = kern_topk_cta_8; } else if (vecLen == 1) { cta_kernel = kern_topk_cta_8; + } else { + RAFT_FAIL("Unexpected vecLen (%d)", vecLen); } cta_kernel<<>>( topK, sizeBatch, maxSamples, numSamples, (const uint16_t*)samples, state, labels); @@ -2807,11 +2811,11 @@ void _cuann_compute_PQ_code(const handle_t& handle, _cuann_kmeans_predict_bufferSize((1 << bitPq), lenPq, max(maxClusterSize, maxTrainset)), "pqPredictWorkspace"); - uint32_t** rotVectorLabels; // [numDevices][maxClusterSize, dimPq,] - uint32_t** pqClusterSize; // [numDevices][1 << bitPq,] - uint32_t** wsKAC; // [numDevices][1] - float** myPqCenters; // [numDevices][1 << bitPq, lenPq] - float** myPqCentersTemp; // [numDevices][1 << bitPq, lenPq] + uint32_t** rotVectorLabels = nullptr; // [numDevices][maxClusterSize, dimPq,] + uint32_t** pqClusterSize = nullptr; // [numDevices][1 << bitPq,] + uint32_t** wsKAC = nullptr; // [numDevices][1] + float** myPqCenters = nullptr; // [numDevices][1 << bitPq, lenPq] + float** myPqCentersTemp = nullptr; // [numDevices][1 << bitPq, lenPq] if ((numIterations > 0) && (typePqCenter == codebook_gen::PER_CLUSTER)) { memset(pqCenters, 0, sizeof(float) * numClusters * (1 << bitPq) * lenPq); rotVectorLabels = @@ -3556,7 +3560,7 @@ void cuannIvfPqBuildIndex( for (uint32_t i = 0; i < numTrainset; i++) { uint32_t l = trainsetLabels[i]; for (uint32_t j = 0; j < desc->dimRotDataset; j++) { - float val; + float val = FLT_MAX; if (dtype == CUDA_R_32F) { val = _cuann_dot(desc->dimDataset, From 140f186b8264ad722d4332e93cb028eb755b1dd3 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 18 Aug 2022 09:35:40 +0200 Subject: [PATCH 023/140] Don't store all k 
scores in the temporary buffer --- .../raft/spatial/knn/detail/ivf_pq_legacy.cuh | 121 ++---------------- 1 file changed, 10 insertions(+), 111 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh index a273a07d38..6ccbc089fb 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh @@ -4892,97 +4892,6 @@ class BlockTopk { } }; -/** - * Update the scores stored in the device memory from the scores in per-thread - * memory as obtained by BlockTopk - * - * TODO: global_output is not volatile and is accessed using both atomics and plain reads... - * - * @param topk - * @param my_score - small per-thread buffer containing scores for this thread - * @param global_output - global scores - */ -template -__device__ inline void update_approx_global_score(uint32_t topk, K* my_score, K* global_output) -{ - if (!__any_sync(0xffffffff, (my_score[0] < global_output[topk - 1]))) { return; } - if (topk <= 32) { - K score = max_value_of(); - if (threadIdx.x < topk) { score = global_output[threadIdx.x]; } - warp_sort(score, false); - swap_if_needed(my_score[0], score); - - warp_merge(my_score[0]); - if (threadIdx.x < topk) { atomicMin(global_output + threadIdx.x, my_score[0]); } - } else if (topk <= 64) { - K score = max_value_of(); - if (threadIdx.x + 32 < topk) { score = global_output[threadIdx.x + 32]; } - warp_sort(score, false); - swap_if_needed(my_score[0], score); - score = global_output[threadIdx.x]; - warp_sort(score, false); - swap_if_needed(my_score[1], score); - - swap_if_needed(my_score[0], my_score[1]); - warp_merge(my_score[1]); - warp_merge(my_score[0]); - - atomicMin(global_output + threadIdx.x, my_score[0]); - if (threadIdx.x + 32 < topk) { atomicMin(global_output + threadIdx.x + 32, my_score[1]); } - } else if (topk <= 96) { - K score = max_value_of(); - if (threadIdx.x + 64 < topk) { score = global_output[threadIdx.x + 64]; } - warp_sort(score, false); - swap_if_needed(my_score[1], score); - score = global_output[threadIdx.x + 32]; - warp_sort(score, false); - swap_if_needed(my_score[2], score); - score = global_output[threadIdx.x]; - warp_sort(score, false); - K my_score_3_ = score; - - swap_if_needed(my_score[0], my_score[2]); - swap_if_needed(my_score[1], my_score_3_); - swap_if_needed(my_score[2], my_score_3_); - warp_merge(my_score[2]); - swap_if_needed(my_score[0], my_score[1]); - warp_merge(my_score[1]); - warp_merge(my_score[0]); - - atomicMin(global_output + threadIdx.x, my_score[0]); - atomicMin(global_output + threadIdx.x + 32, my_score[1]); - if (threadIdx.x + 64 < topk) { atomicMin(global_output + threadIdx.x + 64, my_score[2]); } - } else if (topk <= 128) { - K score = max_value_of(); - if (threadIdx.x + 96 < topk) { score = global_output[threadIdx.x + 96]; } - warp_sort(score, false); - swap_if_needed(my_score[0], score); - score = global_output[threadIdx.x + 64]; - warp_sort(score, false); - swap_if_needed(my_score[1], score); - score = global_output[threadIdx.x + 32]; - warp_sort(score, false); - swap_if_needed(my_score[2], score); - score = global_output[threadIdx.x]; - warp_sort(score, false); - swap_if_needed(my_score[3], score); - - swap_if_needed(my_score[0], my_score[2]); - swap_if_needed(my_score[1], my_score[3]); - swap_if_needed(my_score[2], my_score[3]); - warp_merge(my_score[3]); - warp_merge(my_score[2]); - swap_if_needed(my_score[0], my_score[1]); - warp_merge(my_score[1]); - warp_merge(my_score[0]); - - 
atomicMin(global_output + threadIdx.x, my_score[0]); - atomicMin(global_output + threadIdx.x + 32, my_score[1]); - atomicMin(global_output + threadIdx.x + 64, my_score[2]); - if (threadIdx.x + 96 < topk) { atomicMin(global_output + threadIdx.x + 96, my_score[3]); } - } -} - // template __device__ inline outDtype get_out_score(float score, distance::DistanceType metric) @@ -5024,7 +4933,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( const float* _query, // [sizeBatch, dimDataset,] const uint32_t* indexList, // [sizeBatch * numProbes] float* _preCompScores, // [...] - float* _topkScores, // [sizeBatch, topk] + float* _topkScores, // [sizeBatch] outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] uint32_t* _topkIndex // [sizeBatch, numProbes, topk] ) @@ -5061,7 +4970,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( // Store topk calculated distances to output (and its indices to topkIndex) output = _output + (topk * (iProbe + (numProbes * iBatch))); topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); - approx_global_score = _topkScores + (topk * iBatch); + approx_global_score = _topkScores + iBatch; } else { // Store all calculated distances to output output = _output + (maxSamples * iBatch); @@ -5116,8 +5025,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( // topk::block_sort // block_topk(topk, ___); - BlockTopk block_topk( - topk, manageLocalTopk ? approx_global_score + topk - 1 : NULL); + BlockTopk block_topk(topk, approx_global_score); __syncthreads(); // Compute a distance for each sample @@ -5150,12 +5058,8 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( } // Approximate update of global topk entries - if (warp_id == 0) { - float my_score[depth]; - for (int j = 0; j < depth; j++) { - my_score[j] = block_topk.key(j); - } - update_approx_global_score(topk, my_score, approx_global_score); + if (threadIdx.x == (topk - 1) % 32) { + atomicMin(approx_global_score, block_topk.key((topk - 1) / 32)); } } @@ -5181,7 +5085,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( const float* _query, // [sizeBatch, dimDataset,] const uint32_t* indexList, // [sizeBatch * numProbes] float* _preCompScores, // [..., dimPq << bitPq,] - float* _topkScores, // [sizeBatch, topk] + float* _topkScores, // [sizeBatch] outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] uint32_t* _topkIndex // [sizeBatch, numProbes, topk] ) @@ -5217,7 +5121,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( // Store topk calculated distances to output (and its indices to topkIndex) output = _output + (topk * (iProbe + (numProbes * iBatch))); topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); - approx_global_score = _topkScores + (topk * iBatch); + approx_global_score = _topkScores + iBatch; } else { // Store all calculated distances to output output = _output + (maxSamples * iBatch); @@ -5270,8 +5174,7 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } uint32_t iDatasetBase = clusterIndexPtr[label]; - BlockTopk block_topk( - topk, manageLocalTopk ? 
approx_global_score + topk - 1 : NULL); + BlockTopk block_topk(topk, approx_global_score); __syncthreads(); // Compute a distance for each sample @@ -5306,12 +5209,8 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( } // Approximate update of global topk entries - if (warp_id == 0) { - float my_score[depth]; - for (int j = 0; j < depth; j++) { - my_score[j] = block_topk.key(j); - } - update_approx_global_score(topk, my_score, approx_global_score); + if (threadIdx.x == (topk - 1) % 32) { + atomicMin(approx_global_score, block_topk.key((topk - 1) / 32)); } __syncthreads(); } From cb70c79e189bd234953bc54683619ee79f208c30 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 18 Aug 2022 11:59:03 +0200 Subject: [PATCH 024/140] Use raft topk code --- .../raft/spatial/knn/detail/ivf_pq_legacy.cuh | 540 ++---------------- .../spatial/knn/detail/topk/warpsort_topk.cuh | 2 +- 2 files changed, 53 insertions(+), 489 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh index 6ccbc089fb..826e0dee2e 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_legacy.cuh @@ -2222,23 +2222,13 @@ __global__ void ivfpq_make_outputs(uint32_t numProbes, // inline bool manage_local_topk(cuannIvfPqDescriptor_t& desc) { - int depth = (desc->topK + 31) / 32; + int depth = raft::ceildiv(desc->topK, 32); if (depth > 4) { return false; } if (desc->numProbes < 16) { return false; } if (desc->maxBatchSize * desc->numProbes < 256) { return false; } return true; } -// -inline size_t get_sizeSmemForLocalTopk(cuannIvfPqDescriptor_t& desc, int numThreads) -{ - if (manage_local_topk(desc)) { - int topk_32 = (desc->topK + 31) / 32; - return (sizeof(float) + sizeof(uint32_t)) * (numThreads / 2) * topk_32; - } - return 0; -} - // return workspace size inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescriptor_t& desc) { @@ -2287,11 +2277,6 @@ inline size_t ivfpq_search_bufferSize(const handle_t& handle, cuannIvfPqDescript size += Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes * desc->topK); } - // topkScores - if (manage_local_topk(desc)) { - // [maxBatchSize, topk] - size += Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK); - } // preCompScores [multiProcessorCount, dimPq, 1 << bitPq,] size += Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * desc->dimPq * (1 << desc->bitPq)); @@ -4490,10 +4475,9 @@ template __device__ inline float ivfpq_compute_score( uint32_t dimPq, uint32_t iDataset, - const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] - const smemLutDtype* preCompScores, // [dimPq, 1 << bitPq] - bool earlyStop, - float kth_score = FLT_MAX) + const uint8_t* pqDataset, // [numDataset, dimPq * bitPq / 8] + const smemLutDtype* preCompScores // [dimPq, 1 << bitPq] +) { float score = 0.0; constexpr uint32_t bitT = sizeof(T) * 8; @@ -4521,386 +4505,11 @@ __device__ inline float ivfpq_compute_score( code &= (1 << bitPq) - 1; score += (float)preCompScores[code]; preCompScores += (1 << bitPq); - - if (earlyStop && (vecLen > 8) && ((k % 8) == 0)) { - if (score > kth_score) { return FLT_MAX; } - } - } - if (earlyStop && (vecLen <= 8)) { - if (score > kth_score) { return FLT_MAX; } } } return score; } -// -template -__device__ inline void warp_merge(K& key, bool acending = true, int group_size = 32) -{ - int lane_id = threadIdx.x % 32; - for (int mask = (group_size >> 1); mask > 
0; mask >>= 1) { - bool direction = ((lane_id & mask) == 0); - K opp_key = __shfl_xor_sync(0xffffffff, key, mask); - if ((acending == direction) == (key > opp_key)) { key = opp_key; } - } -} - -// -template -__device__ inline void warp_merge(K& key, V& val, bool acending = true, int group_size = 32) -{ - int lane_id = threadIdx.x % 32; - for (int mask = (group_size >> 1); mask > 0; mask >>= 1) { - bool direction = ((lane_id & mask) == 0); - K opp_key = __shfl_xor_sync(0xffffffff, key, mask); - V opp_val = __shfl_xor_sync(0xffffffff, val, mask); - if ((acending == direction) == ((key > opp_key) || ((key == opp_key) && (val > opp_val)))) { - key = opp_key; - val = opp_val; - } - } -} - -// -template -__device__ inline void warp_sort(K& key, bool acending = true) -{ - int lane_id = threadIdx.x % 32; - for (int group_size = 2; group_size <= 32; group_size <<= 1) { - bool direction = ((lane_id & group_size) == 0); - if ((group_size == 32) && (!acending)) { direction = !direction; } - warp_merge(key, direction, group_size); - } -} - -// -template -__device__ inline void warp_sort(K& key, V& val, bool acending = true) -{ - int lane_id = threadIdx.x % 32; - for (int group_size = 2; group_size <= 32; group_size <<= 1) { - bool direction = ((lane_id & group_size) == 0); - if ((group_size == 32) && (!acending)) { direction = !direction; } - warp_merge(key, val, direction, group_size); - } -} - -// -template -__device__ inline void swap_vals(T& val1, T& val2) -{ - T val0 = val1; - val1 = val2; - val2 = val0; -} - -// -template -__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) -{ - if ((key1 > key2) || ((key1 == key2) && (val1 > val2))) { - swap_vals(key1, key2); - swap_vals(val1, val2); - return true; - } - return false; -} - -// -template -__device__ inline bool swap_if_needed(K& key1, K& key2) -{ - if (key1 > key2) { - swap_vals(key1, key2); - return true; - } - return false; -} - -// -template -__device__ inline T max_value_of(); -template <> -__device__ inline float max_value_of() -{ - return FLT_MAX; -} -template <> -__device__ inline uint32_t max_value_of() -{ - return ~0u; -} - -// depth == kMaxArrLen = Capacity / kWarpWidth; (kWarpWidth fixed to 32) -template -class BlockTopk { - public: - /** - * @param topk - k, must be not greater than depth * 32 - * @param ptr_kth_key - initial value for k-th element; it's never updated here, - * seems to be needed only for optimization. It's read multiple times though. Is it updated - * elsewhere? Perhaps, only when other blocks in the grid finish their execution. - * Also, this parameter is optional; when null, the value is simply not used. 
- * - */ - __device__ BlockTopk(uint32_t topk, K* ptr_kth_key) : _topk(topk), _lane_id(threadIdx.x % 32) - { -#pragma unroll - for (int i = 0; i < depth; i++) { - _key[i] = max_value_of(); - _val[i] = max_value_of(); - } - _nfill = 0; - _init_buf(); - _ptr_kth_key = ptr_kth_key; - if (_ptr_kth_key) { - _kth_key = _ptr_kth_key[0]; - } else { - _kth_key = max_value_of(); - } - // __syncthreads(); - } - - __device__ inline K key(int i) { return _key[i]; } - - __device__ inline V val(int i) { return _val[i]; } - - __device__ inline K kth_key() { return _kth_key; } - - __device__ void add(K key, V val) - { - uint32_t mask = __ballot_sync(0xffffffff, (key < _kth_key)); - if (mask == 0) { return; } - uint32_t nvalid = __popc(mask); - if (_buf_nvalid + nvalid > 32) { - _add(_buf_key, _buf_val); - _init_buf(); - if (_ptr_kth_key) { _kth_key = min(_kth_key, _ptr_kth_key[0]); } - } - _push_buf(key, val, mask, nvalid); - } - - __device__ void finalize() - { - if (_buf_nvalid > 0) { _add(_buf_key, _buf_val); } - _merge(); - } - - protected: - K _key[depth]; - V _val[depth]; - K* _ptr_kth_key; - K _kth_key; - uint32_t _nfill; // 0 <= _nfill <= depth - K _buf_key; - V _buf_val; - uint32_t _buf_nvalid; // 0 <= _buf_nvalid <= 32 - - const uint32_t _topk; - const uint32_t _lane_id; - - __device__ inline void _init_buf() - { - _buf_nvalid = 0; - _buf_key = max_value_of(); - _buf_val = max_value_of(); - } - - __device__ inline void _adjust_nfill() - { -#pragma unroll - for (int j = 1; j < depth; j++) { - if (_nfill == depth - j + 1) { - if (__shfl_sync(0xffffffff, _key[depth - j], 0) <= _kth_key) { return; } - _nfill = depth - j; - } - } - } - - __device__ inline void _push_buf(K key, V val, uint32_t mask, uint32_t nvalid) - { - int i = 0; - if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { - int j = _lane_id - _buf_nvalid; - while (j > 0) { - i = __ffs(mask) - 1; - mask ^= (0x1u << i); - j -= 1; - } - i = __ffs(mask) - 1; - } - K temp_key = __shfl_sync(0xffffffff, key, i); - K temp_val = __shfl_sync(0xffffffff, val, i); - if ((_buf_nvalid <= _lane_id) && (_lane_id < _buf_nvalid + nvalid)) { - _buf_key = temp_key; - _buf_val = temp_val; - } - _buf_nvalid += nvalid; - } - - __device__ inline void _add(K key, V val) - { - if (_nfill == 0) { - warp_sort(key, val); - _key[0] = key; - _val[0] = val; - } else if (_nfill == 1) { - warp_sort(key, val, false); - swap_if_needed(_key[0], key, _val[0], val); - if (depth > 1) { - _key[1] = key; - _val[1] = val; - warp_merge(_key[1], _val[1]); - } - warp_merge(_key[0], _val[0]); - } else if ((depth >= 2) && (_nfill == 2)) { - warp_sort(key, val, false); - swap_if_needed(_key[1], key, _val[1], val); - if (depth > 2) { - _key[2] = key; - _val[2] = val; - warp_merge(_key[2], _val[2]); - } - warp_merge(_key[1], _val[1], false); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if ((depth >= 3) && (_nfill == 3)) { - warp_sort(key, val, false); - swap_if_needed(_key[2], key, _val[2], val); - if (depth > 3) { - _key[3] = key; - _val[3] = val; - warp_merge(_key[3], _val[3]); - } - warp_merge(_key[2], _val[2], false); - swap_if_needed(_key[1], _key[2], _val[1], _val[2]); - warp_merge(_key[2], _val[2]); - warp_merge(_key[1], _val[1], false); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if ((depth >= 4) && (_nfill == 4)) { - warp_sort(key, val, false); - swap_if_needed(_key[3], key, _val[3], val); - 
warp_merge(_key[3], _val[3], false); - swap_if_needed(_key[2], _key[3], _val[2], _val[3]); - warp_merge(_key[3], _val[3]); - warp_merge(_key[2], _val[2], false); - swap_if_needed(_key[1], _key[2], _val[1], _val[2]); - warp_merge(_key[2], _val[2]); - warp_merge(_key[1], _val[1], false); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } - _nfill = min(_nfill + 1, depth); - if (_nfill == depth) { - _kth_key = - min(_kth_key, __shfl_sync(0xffffffff, _key[depth - 1], _topk - 1 - (depth - 1) * 32)); - } - } - - __device__ inline void _merge() - { - uint32_t warp_id = threadIdx.x / 32; - uint32_t num_warps = blockDim.x / 32; - K* smem_key = smemArray; - V* smem_val = (V*)(smem_key + (blockDim.x / 2) * depth); - for (int j = num_warps / 2; j > 0; j /= 2) { - __syncthreads(); - if ((j <= warp_id) && (warp_id < (j * 2))) { - uint32_t opp_tid = threadIdx.x - (j * 32); - smem_key[opp_tid] = _key[0]; - smem_val[opp_tid] = _val[0]; - if (depth >= 2) { - smem_key[opp_tid + (j * 32)] = _key[1]; - smem_val[opp_tid + (j * 32)] = _val[1]; - } - if (depth >= 3) { - smem_key[opp_tid + (j * 32) * 2] = _key[2]; - smem_val[opp_tid + (j * 32) * 2] = _val[2]; - } - if (depth >= 4) { - smem_key[opp_tid + (j * 32) * 3] = _key[3]; - smem_val[opp_tid + (j * 32) * 3] = _val[3]; - } - } - __syncthreads(); - if (warp_id < j) { - K key; - V val; - if (depth == 1) { - key = smem_key[threadIdx.x ^ 31]; - val = smem_val[threadIdx.x ^ 31]; - swap_if_needed(_key[0], key, _val[0], val); - - warp_merge(_key[0], _val[0]); - } else if (depth == 2) { - key = smem_key[threadIdx.x ^ 31 + (j * 32)]; - val = smem_val[threadIdx.x ^ 31 + (j * 32)]; - swap_if_needed(_key[0], key, _val[0], val); - key = smem_key[threadIdx.x ^ 31]; - val = smem_val[threadIdx.x ^ 31]; - swap_if_needed(_key[1], key, _val[1], val); - - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if (depth == 3) { - key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; - val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; - swap_if_needed(_key[1], key, _val[1], val); - key = smem_key[threadIdx.x ^ 31 + (j * 32)]; - val = smem_val[threadIdx.x ^ 31 + (j * 32)]; - swap_if_needed(_key[2], key, _val[2], val); - K _key_3_ = smem_key[threadIdx.x ^ 31]; - V _val_3_ = smem_val[threadIdx.x ^ 31]; - - swap_if_needed(_key[0], _key[2], _val[0], _val[2]); - swap_if_needed(_key[1], _key_3_, _val[1], _val_3_); - swap_if_needed(_key[2], _key_3_, _val[2], _val_3_); - warp_merge(_key[2], _val[2]); - swap_if_needed(_key[0], _key[1], _val[0], _val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } else if (depth == 4) { - key = smem_key[threadIdx.x ^ 31 + (j * 32) * 3]; - val = smem_val[threadIdx.x ^ 31 + (j * 32) * 3]; - swap_if_needed(_key[0], key, _val[0], val); - key = smem_key[threadIdx.x ^ 31 + (j * 32) * 2]; - val = smem_val[threadIdx.x ^ 31 + (j * 32) * 2]; - swap_if_needed(_key[1], key, _val[1], val); - key = smem_key[threadIdx.x ^ 31 + (j * 32)]; - val = smem_val[threadIdx.x ^ 31 + (j * 32)]; - swap_if_needed(_key[2], key, _val[2], val); - key = smem_key[threadIdx.x ^ 31]; - val = smem_val[threadIdx.x ^ 31]; - swap_if_needed(_key[3], key, _val[3], val); - - swap_if_needed(_key[0], _key[2], _val[0], _val[2]); - swap_if_needed(_key[1], _key[3], _val[1], _val[3]); - swap_if_needed(_key[2], _key[3], _val[2], _val[3]); - warp_merge(_key[3], _val[3]); - warp_merge(_key[2], _val[2]); - swap_if_needed(_key[0], _key[1], _val[0], 
_val[1]); - warp_merge(_key[1], _val[1]); - warp_merge(_key[0], _val[0]); - } - } - } - } -}; - -// -template -__device__ inline outDtype get_out_score(float score, distance::DistanceType metric) -{ - if (metric == distance::DistanceType::InnerProduct) { score = score / 2.0 - 1.0; } - if (sizeof(outDtype) == 2) { score = min(score, FP16_MAX); } - return (outDtype)score; -} - // // (*) Restrict the peak GPU occupancy up-to 50% by "__launch_bounds__(1024, 1)", // as there were cases where performance dropped by a factor of two or more on V100 @@ -4933,15 +4542,13 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( const float* _query, // [sizeBatch, dimDataset,] const uint32_t* indexList, // [sizeBatch * numProbes] float* _preCompScores, // [...] - float* _topkScores, // [sizeBatch] outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] uint32_t* _topkIndex // [sizeBatch, numProbes, topk] ) { const uint32_t lenPq = dimDataset / dimPq; - float* smem = smemArray; - smemLutDtype* preCompScores = (smemLutDtype*)smem; + smemLutDtype* preCompScores = (smemLutDtype*)smemArray; float* baseDiff = NULL; if (preCompBaseDiff) { baseDiff = (float*)(preCompScores + (dimPq << bitPq)); } bool manageLocalTopk = false; @@ -4964,13 +4571,11 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); const float* query = _query + (dimDataset * iBatch); outDtype* output; - uint32_t* topkIndex = NULL; - float* approx_global_score = NULL; + uint32_t* topkIndex = NULL; if (manageLocalTopk) { // Store topk calculated distances to output (and its indices to topkIndex) - output = _output + (topk * (iProbe + (numProbes * iBatch))); - topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); - approx_global_score = _topkScores + iBatch; + output = _output + (topk * (iProbe + (numProbes * iBatch))); + topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); } else { // Store all calculated distances to output output = _output + (maxSamples * iBatch); @@ -5023,44 +4628,29 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity( if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } uint32_t iDatasetBase = clusterIndexPtr[label]; - // topk::block_sort - // block_topk(topk, ___); - BlockTopk block_topk(topk, approx_global_score); - __syncthreads(); + using block_sort_t = + topk::block_sort; + block_sort_t block_topk(topk, reinterpret_cast(smemArray)); + const outDtype limit = block_sort_t::queue_t::kDummy; // Compute a distance for each sample for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { - float score = FLT_MAX; + float score = limit; if (i < nSamples) { score = ivfpq_compute_score( - dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); + dimPq, i + iDatasetBase, pqDataset, preCompScores); } if (!manageLocalTopk) { - if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } + if (i < nSamples) { output[i + iSampleBase] = score; } } else { - uint32_t val = i; - block_topk.add(score, val); + block_topk.add(score, iDatasetBase + i); } } if (!manageLocalTopk) { return; } - block_topk.finalize(); - // block_topk.done(); - - // Output topk score and index - uint32_t warp_id = threadIdx.x / 32; - if (warp_id == 0) { - for (int j = 0; j < depth; j++) { - if (threadIdx.x + (32 * j) < topk) { - output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); - 
topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; - } - } - } - - // Approximate update of global topk entries - if (threadIdx.x == (topk - 1) % 32) { - atomicMin(approx_global_score, block_topk.key((topk - 1) / 32)); - } + // sync threads before the topk merging operation, because we reuse the shared memory + __syncthreads(); + block_topk.done(); + block_topk.store(output, topkIndex); } // @@ -5085,7 +4675,6 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( const float* _query, // [sizeBatch, dimDataset,] const uint32_t* indexList, // [sizeBatch * numProbes] float* _preCompScores, // [..., dimPq << bitPq,] - float* _topkScores, // [sizeBatch] outDtype* _output, // [sizeBatch, maxSamples,] or [sizeBatch, numProbes, topk] uint32_t* _topkIndex // [sizeBatch, numProbes, topk] ) @@ -5115,13 +4704,11 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( const uint32_t* chunkIndexPtr = _chunkIndexPtr + (numProbes * iBatch); const float* query = _query + (dimDataset * iBatch); outDtype* output; - uint32_t* topkIndex = NULL; - float* approx_global_score = NULL; + uint32_t* topkIndex = NULL; if (manageLocalTopk) { // Store topk calculated distances to output (and its indices to topkIndex) - output = _output + (topk * (iProbe + (numProbes * iBatch))); - topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); - approx_global_score = _topkScores + iBatch; + output = _output + (topk * (iProbe + (numProbes * iBatch))); + topkIndex = _topkIndex + (topk * (iProbe + (numProbes * iBatch))); } else { // Store all calculated distances to output output = _output + (maxSamples * iBatch); @@ -5174,44 +4761,30 @@ __launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_no_smem_lut( if (nSamples32 % 32 > 0) { nSamples32 = nSamples32 + (32 - (nSamples % 32)); } uint32_t iDatasetBase = clusterIndexPtr[label]; - BlockTopk block_topk(topk, approx_global_score); - __syncthreads(); + using block_sort_t = + topk::block_sort; + block_sort_t block_topk(topk, reinterpret_cast(smemArray)); + const outDtype limit = block_sort_t::queue_t::kDummy; // Compute a distance for each sample for (uint32_t i = threadIdx.x; i < nSamples32; i += blockDim.x) { - float score = FLT_MAX; + float score = limit; if (i < nSamples) { - score = ivfpq_compute_score( - dimPq, i + iDatasetBase, pqDataset, preCompScores, manageLocalTopk, block_topk.kth_key()); + score = + ivfpq_compute_score(dimPq, i + iDatasetBase, pqDataset, preCompScores); } if (!manageLocalTopk) { - if (i < nSamples) { output[i + iSampleBase] = get_out_score(score, metric); } + if (i < nSamples) { output[i + iSampleBase] = score; } } else { - uint32_t val = i; - block_topk.add(score, val); + block_topk.add(score, iDatasetBase + i); } } __syncthreads(); if (!manageLocalTopk) { continue; // for (int ib ...) 
} - block_topk.finalize(); - - // Output topk score and index - uint32_t warp_id = threadIdx.x / 32; - if (warp_id == 0) { - for (int j = 0; j < depth; j++) { - if (threadIdx.x + (32 * j) < topk) { - output[threadIdx.x + (32 * j)] = get_out_score(block_topk.key(j), metric); - topkIndex[threadIdx.x + (32 * j)] = block_topk.val(j) + iDatasetBase; - } - } - } - - // Approximate update of global topk entries - if (threadIdx.x == (topk - 1) % 32) { - atomicMin(approx_global_score, block_topk.key((topk - 1) / 32)); - } + block_topk.done(); + block_topk.store(output, topkIndex); __syncthreads(); } } @@ -5248,7 +4821,6 @@ inline void ivfpq_search(const handle_t& handle, // [maxBatchSize, numProbes, topk] uint32_t* simTopkIndex; // [maxBatchSize, numProbes, topk] /* Preset with the dummy value and only accessed within the main kernel. */ - float* topkScores; // [maxBatchSize, topk] float* preCompScores = NULL; void* topkWorkspace; @@ -5272,16 +4844,13 @@ inline void ivfpq_search(const handle_t& handle, (scoreDtype*)((uint8_t*)topkSids + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->topK)); if (manage_local_topk(desc)) { - topkScores = - (float*)((uint8_t*)similarity + Pow2<128>::roundUp(sizeof(scoreDtype) * desc->maxBatchSize * - desc->numProbes * desc->topK)); - simTopkIndex = (uint32_t*)((uint8_t*)topkScores + - Pow2<128>::roundUp(sizeof(float) * desc->maxBatchSize * desc->topK)); + simTopkIndex = (uint32_t*)((uint8_t*)similarity + + Pow2<128>::roundUp(sizeof(scoreDtype) * desc->maxBatchSize * + desc->numProbes * desc->topK)); preCompScores = (float*)((uint8_t*)simTopkIndex + Pow2<128>::roundUp(sizeof(uint32_t) * desc->maxBatchSize * desc->numProbes * desc->topK)); } else { - topkScores = NULL; simTopkIndex = NULL; preCompScores = (float*)((uint8_t*)similarity + @@ -5291,14 +4860,6 @@ inline void ivfpq_search(const handle_t& handle, (void*)((uint8_t*)preCompScores + Pow2<128>::roundUp(sizeof(float) * getMultiProcessorCount() * desc->dimPq * (1 << desc->bitPq))); - // - if (manage_local_topk(desc)) { - thrust::fill_n(handle.get_thrust_policy(), - thrust::device_pointer_cast(topkScores), - numQueries * desc->topK, - FLT_MAX); - } - // dim3 mcThreads(1024, 1, 1); // DO NOT CHANGE dim3 mcBlocks(numQueries, 1, 1); @@ -5388,18 +4949,20 @@ inline void ivfpq_search(const handle_t& handle, const float*, const uint32_t*, float*, - float*, scoreDtype*, uint32_t*); kernel_t kernel_no_basediff; kernel_t kernel_fast; kernel_t kernel_no_smem_lut; - int depth = 1; - if (manage_local_topk(desc)) { depth = (desc->topK + 31) / 32; } + uint32_t depth = 1; + if (manage_local_topk(desc)) { + while (depth * WarpSize < desc->topK) { + depth *= 2; + } + } switch (depth) { case 1: SET_KERNEL3(1); break; case 2: SET_KERNEL3(2); break; - case 3: SET_KERNEL3(3); break; case 4: SET_KERNEL3(4); break; default: RAFT_FAIL("ivf_pq::search(k = %u): depth value is too big (%d)", desc->topK, depth); } @@ -5429,8 +4992,9 @@ inline void ivfpq_search(const handle_t& handle, } else { numThreads = desc->preferredThreadBlockSize; } - size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); - sizeSmem = max(sizeSmem, sizeSmemForLocalTopk); + size_t sizeSmemForLocalTopk = topk::template calc_smem_size_for_block_wide( + numThreads / WarpSize, desc->topK); + sizeSmem = max(sizeSmem, sizeSmemForLocalTopk); kernel_t kernel = kernel_no_basediff; @@ -5445,11 +5009,12 @@ inline void ivfpq_search(const handle_t& handle, kernel_no_basediff_available = false; // Use "kernel_no_smem_lut" which just uses small amount 
of shared memory. - kernel = kernel_no_smem_lut; - numThreads = 1024; - size_t sizeSmemForLocalTopk = get_sizeSmemForLocalTopk(desc, numThreads); - sizeSmem = max(sizeSmemBaseDiff, sizeSmemForLocalTopk); - numCTAs = getMultiProcessorCount(); + kernel = kernel_no_smem_lut; + numThreads = 1024; + size_t sizeSmemForLocalTopk = + topk::calc_smem_size_for_block_wide(numThreads / WarpSize, desc->topK); + sizeSmem = max(sizeSmemBaseDiff, sizeSmemForLocalTopk); + numCTAs = getMultiProcessorCount(); } } if (kernel_no_basediff_available) { @@ -5500,7 +5065,6 @@ inline void ivfpq_search(const handle_t& handle, query, indexListSorted, preCompScores, - topkScores, (scoreDtype*)similarity, simTopkIndex); #if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh index 017678afbb..4acaf04704 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh @@ -429,9 +429,9 @@ template