diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index 72a1244269..64a1217d7f 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -17,7 +17,8 @@
 #include
 #include
-#include
+
+#include
 #if defined RAFT_NN_COMPILED
 #include
 #endif
 
@@ -126,6 +127,34 @@ struct host_uvector {
   T* arr_;
 };
 
+template <typename ValT, typename IdxT>
+struct ivf_flat_knn {
+  using dist_t = float;
+
+  std::optional<raft::spatial::knn::ivf_flat::index<ValT, IdxT>> index;
+  raft::spatial::knn::ivf_flat::index_params index_params;
+  raft::spatial::knn::ivf_flat::search_params search_params;
+  params ps;
+
+  ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
+  {
+    index_params.n_lists = 4096;
+    index_params.metric  = raft::distance::DistanceType::L2Expanded;
+    index.emplace(raft::spatial::knn::ivf_flat::build(
+      handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              dist_t* out_dists,
+              IdxT* out_idxs)
+  {
+    search_params.n_probes = 20;
+    raft::spatial::knn::ivf_flat::search(
+      handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists);
+  }
+};
+
 template <typename ValT, typename IdxT>
 struct brute_force_knn {
   using dist_t = ValT;
@@ -326,7 +355,13 @@ const std::vector<Scope> kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::B
 }
 
 KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull);
+KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(int8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(uint8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
 KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kNoCopyOnly, kScopeFull);
+KNN_REGISTER(float, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(int8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(uint8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
 
 }  // namespace raft::bench::spatial
diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh
index 41dc9cab08..0c4750aa69 100644
--- a/cpp/include/raft/common/device_loads_stores.cuh
+++ b/cpp/include/raft/common/device_loads_stores.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,121 @@ namespace raft { * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ +DI void sts(uint8_t* addr, const uint8_t& x) +{ + uint32_t x_int; + x_int = x; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int)); +} +DI void sts(uint8_t* addr, const uint8_t (&x)[1]) +{ + uint32_t x_int[1]; + x_int[0] = x[0]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int[0])); +} +DI void sts(uint8_t* addr, const uint8_t (&x)[2]) +{ + uint32_t x_int[2]; + x_int[0] = x[0]; + x_int[1] = x[1]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.u8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1])); +} +DI void sts(uint8_t* addr, const uint8_t (&x)[4]) +{ + uint32_t x_int[4]; + x_int[0] = x[0]; + x_int[1] = x[1]; + x_int[2] = x[2]; + x_int[3] = x[3]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.u8 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3])); +} + +DI void sts(int8_t* addr, const int8_t& x) +{ + int32_t x_int = x; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int)); +} +DI void sts(int8_t* addr, const int8_t (&x)[1]) +{ + int32_t x_int[1]; + x_int[0] = x[0]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int[0])); +} +DI void sts(int8_t* addr, const int8_t (&x)[2]) +{ + int32_t x_int[2]; + x_int[0] = x[0]; + x_int[1] = x[1]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.s8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1])); +} +DI void sts(int8_t* addr, const int8_t (&x)[4]) +{ + int32_t x_int[4]; + x_int[0] = x[0]; + x_int[1] = x[1]; + x_int[2] = x[2]; + x_int[3] = x[3]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.s8 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3])); +} + +DI void sts(uint32_t* addr, const uint32_t& x) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x)); +} +DI void sts(uint32_t* addr, const uint32_t (&x)[1]) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0])); +} +DI void sts(uint32_t* addr, const uint32_t (&x)[2]) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1])); +} +DI void sts(uint32_t* addr, const uint32_t (&x)[4]) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3])); +} + +DI void sts(int32_t* addr, const int32_t& x) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x)); +} +DI void sts(int32_t* addr, const int32_t (&x)[1]) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0])); +} +DI void sts(int32_t* addr, const int32_t (&x)[2]) +{ + auto s2 = 
__cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1])); +} +DI void sts(int32_t* addr, const int32_t (&x)[4]) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3])); +} + DI void sts(float* addr, const float& x) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); @@ -83,6 +198,152 @@ DI void sts(double* addr, const double (&x)[2]) * @param[in] addr shared memory address from where to load * (should be aligned to vector size) */ + +DI void lds(uint8_t& x, const uint8_t* addr) +{ + uint32_t x_int; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int) : "l"(s1)); + x = x_int; +} +DI void lds(uint8_t (&x)[1], const uint8_t* addr) +{ + uint32_t x_int[1]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1)); + x[0] = x_int[0]; +} +DI void lds(uint8_t (&x)[2], const uint8_t* addr) +{ + uint32_t x_int[2]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} +DI void lds(uint8_t (&x)[4], const uint8_t* addr) +{ + uint32_t x_int[4]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.u8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(s4)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void lds(int8_t& x, const int8_t* addr) +{ + int32_t x_int; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int) : "l"(s1)); + x = x_int; +} +DI void lds(int8_t (&x)[1], const int8_t* addr) +{ + int32_t x_int[1]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1)); + x[0] = x_int[0]; +} +DI void lds(int8_t (&x)[2], const int8_t* addr) +{ + int32_t x_int[2]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} +DI void lds(int8_t (&x)[4], const int8_t* addr) +{ + int32_t x_int[4]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.s8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(s4)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void lds(uint32_t (&x)[4], const uint32_t* addr) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(s4)); +} + +DI void lds(uint32_t (&x)[2], const uint32_t* addr) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2)); +} + +DI void lds(uint32_t (&x)[1], const uint32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1)); +} + +DI void lds(uint32_t& x, const uint32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm 
volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1)); +} + +DI void lds(int32_t (&x)[4], const int32_t* addr) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(s4)); +} + +DI void lds(int32_t (&x)[2], const int32_t* addr) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2)); +} + +DI void lds(int32_t (&x)[1], const int32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1)); +} + +DI void lds(int32_t& x, const int32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1)); +} + +DI void lds(float& x, const float* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); +} +DI void lds(float (&x)[1], const float* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); +} +DI void lds(float (&x)[2], const float* addr) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); +} +DI void lds(float (&x)[4], const float* addr) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" + : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) + : "l"(s4)); +} + DI void lds(float& x, float* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); @@ -159,6 +420,119 @@ DI void ldg(double (&x)[2], const double* addr) { asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); } + +DI void ldg(uint32_t (&x)[4], const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(addr)); +} + +DI void ldg(uint32_t (&x)[2], const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr)); +} + +DI void ldg(uint32_t (&x)[1], const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr)); +} + +DI void ldg(uint32_t& x, const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +DI void ldg(int32_t (&x)[4], const int32_t* const& addr) +{ + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(addr)); +} + +DI void ldg(int32_t (&x)[2], const int32_t* const& addr) +{ + asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr)); +} + +DI void ldg(int32_t (&x)[1], const int32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr)); +} + +DI void ldg(int32_t& x, const int32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +DI void ldg(uint8_t (&x)[4], const uint8_t* const& addr) +{ + uint32_t x_int[4]; + asm volatile("ld.global.cg.v4.u8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void ldg(uint8_t (&x)[2], const uint8_t* const& 
addr) +{ + uint32_t x_int[2]; + asm volatile("ld.global.cg.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} + +DI void ldg(uint8_t (&x)[1], const uint8_t* const& addr) +{ + uint32_t x_int; + asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x[0] = x_int; +} + +DI void ldg(uint8_t& x, const uint8_t* const& addr) +{ + uint32_t x_int; + asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x = x_int; +} + +DI void ldg(int8_t (&x)[4], const int8_t* const& addr) +{ + int x_int[4]; + asm volatile("ld.global.cg.v4.s8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void ldg(int8_t (&x)[2], const int8_t* const& addr) +{ + int x_int[2]; + asm volatile("ld.global.cg.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} + +DI void ldg(int8_t& x, const int8_t* const& addr) +{ + int x_int; + asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x = x_int; +} + +DI void ldg(int8_t (&x)[1], const int8_t* const& addr) +{ + int x_int; + asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x[0] = x_int; +} + /** @} */ } // namespace raft diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp index 7fe6ecaf6b..e0957ea1f3 100644 --- a/cpp/include/raft/core/cudart_utils.hpp +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -26,7 +26,9 @@ #include #include +#include #include +#include #include @@ -445,6 +447,53 @@ constexpr T upper_bound() return std::numeric_limits::max(); } +/** + * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned + * unique pointer. + * + * This function is useful in the code where multiple repeated allocations/deallocations are + * expected. + * Use case example: + * @code{.cpp} + * void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) { + * auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float)); + * if (pool_guard){ + * RAFT_LOG_INFO("Created a pool %zu bytes", pool_guard->pool_size()); + * } else { + * RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource"); + * } + * rmm::device_uvector x(n, stream, mr); + * rmm::device_uvector y(n, stream, mr); + * ... + * } + * @endcode + * Here, the new memory resource would be created within the function scope if the passed `mr` is + * null and the default resource is not a pool. After the call, `mr` contains a valid memory + * resource in any case. + * + * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it + * into a `pool_memory_resource` if neccessary and return the pointer to the result. + * @param initial_size if a new memory pool is created, this would be its initial size (rounded up + * to 256 bytes). + * + * @return if a new memory pool is created, it returns a unique_ptr to it; + * this managed pointer controls the lifetime of the created memory resource. 
+ */
+inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_t initial_size)
+{
+  using pool_res_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+  std::unique_ptr<pool_res_t> pool_res{};
+  if (mr) return pool_res;
+  mr = rmm::mr::get_current_device_resource();
+  if (!dynamic_cast<pool_res_t*>(mr) &&
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(mr) &&
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource>*>(mr)) {
+    pool_res = std::make_unique<pool_res_t>(mr, (initial_size + 255) & (~255));
+    mr       = pool_res.get();
+  }
+  return pool_res;
+}
+
 }  // namespace raft
 
 #endif
diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh
index 362dba66c5..19800cb2d9 100644
--- a/cpp/include/raft/cuda_utils.cuh
+++ b/cpp/include/raft/cuda_utils.cuh
@@ -649,6 +649,67 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff
 #endif
 }
 
+/**
+ * @brief Four-way byte dot product-accumulate.
+ * @tparam T Four-byte integer: int or unsigned int
+ * @tparam S Either same as T or a 4-byte vector of the same signedness.
+ *
+ * @param a
+ * @param b
+ * @param c
+ * @return dot(a, b) + c
+ */
+template <typename T, typename S = T>
+DI auto dp4a(S a, S b, T c) -> T;
+
+template <>
+DI auto dp4a(char4 a, char4 b, int c) -> int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  c += static_cast<int>(a.x) * static_cast<int>(b.x);
+  c += static_cast<int>(a.y) * static_cast<int>(b.y);
+  c += static_cast<int>(a.z) * static_cast<int>(b.z);
+  c += static_cast<int>(a.w) * static_cast<int>(b.w);
+  return c;
+#endif
+}
+
+template <>
+DI auto dp4a(uchar4 a, uchar4 b, unsigned int c) -> unsigned int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  c += static_cast<unsigned int>(a.x) * static_cast<unsigned int>(b.x);
+  c += static_cast<unsigned int>(a.y) * static_cast<unsigned int>(b.y);
+  c += static_cast<unsigned int>(a.z) * static_cast<unsigned int>(b.z);
+  c += static_cast<unsigned int>(a.w) * static_cast<unsigned int>(b.w);
+  return c;
+#endif
+}
+
+template <>
+DI auto dp4a(int a, int b, int c) -> int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  return dp4a(*reinterpret_cast<char4*>(&a), *reinterpret_cast<char4*>(&b), c);
+#endif
+}
+
+template <>
+DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  return dp4a(*reinterpret_cast<uchar4*>(&a), *reinterpret_cast<uchar4*>(&b), c);
+#endif
+}
+
 /**
  * @brief Warp-level sum reduction
  * @param val input value
diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index 9d749ee47b..48094e3ccf 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -107,10 +107,13 @@ class device_uvector {
   /**
    * @brief Ctor that accepts a size, stream and an optional mr.
    */
-  explicit device_uvector(
-    std::size_t size,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  explicit device_uvector(std::size_t size, rmm::cuda_stream_view stream) : data_{size, stream} {}
+  /**
+   * @brief Ctor that accepts a size, stream and a memory resource.
+   */
+  explicit device_uvector(std::size_t size,
+                          rmm::cuda_stream_view stream,
+                          rmm::mr::device_memory_resource* mr)
     : data_{size, stream, mr}
   {
   }
@@ -162,14 +165,10 @@ class device_uvector_policy {
   }
 
   device_uvector_policy() = delete;
-  explicit device_uvector_policy(rmm::cuda_stream_view stream) noexcept(
-    std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
-    : stream_{stream}, mr_(nullptr)
-  {
-  }
-
-  device_uvector_policy(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) noexcept(
-    std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
+  explicit device_uvector_policy(
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr =
+      nullptr) noexcept(std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
     : stream_{stream}, mr_(mr)
   {
   }
diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h
index 5fc56de14b..a2ce7598c6 100644
--- a/cpp/include/raft/integer_utils.h
+++ b/cpp/include/raft/integer_utils.h
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  * Copyright 2019 Eyal Rozenberg
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -161,4 +161,24 @@ std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(T
   return value;
 }
 
+/**
+ * @defgroup Check whether the numeric conversion is narrowing
+ *
+ * @tparam From source type
+ * @tparam To destination type
+ * @{
+ */
+template <typename From, typename To, typename = void>
+struct is_narrowing : std::true_type {
+};
+
+template <typename From, typename To>
+struct is_narrowing<From, To, std::void_t<decltype(To{std::declval<From>()})>> : std::false_type {
+};
+/** @} */
+
+/** Check whether the numeric conversion is narrowing */
+template <typename From, typename To>
+inline constexpr bool is_narrowing_v = is_narrowing<From, To>::value;  // NOLINT
+
 }  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 2ef2ae0fa4..befb5524ac 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -14,20 +14,14 @@
  * limitations under the License.
*/ -#ifndef __ANN_H -#define __ANN_H - #pragma once #include "ann_common.h" -#include "detail/ann_quantized_faiss.cuh" +#include "detail/ann_quantized.cuh" -#include -#include +#include -namespace raft { -namespace spatial { -namespace knn { +namespace raft::spatial::knn { /** * @brief Flat C++ API function to build an approximate nearest neighbors index @@ -42,16 +36,19 @@ namespace knn { * @param[in] n number of rows in the index array * @param[in] D the dimensionality of the index array */ -template -inline void approx_knn_build_index(raft::handle_t& handle, - raft::spatial::knn::knnIndex* index, - knnIndexParam* params, - raft::distance::DistanceType metric, - float metricArg, - float* index_array, - value_idx n, - value_idx D) +template +[[deprecated("Consider using new-style raft::spatial::knn::*::build functions")]] inline void +approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + knnIndexParam* params, + raft::distance::DistanceType metric, + float metricArg, + T* index_array, + value_idx n, + value_idx D) { + common::nvtx::range fun_scope( + "legacy approx_knn_build_index(n_rows = %u, dim = %u)", n, D); detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); } @@ -68,20 +65,19 @@ inline void approx_knn_build_index(raft::handle_t& handle, * @param[in] query_array the query to perform a search with * @param[in] n number of rows in the query array */ -template -inline void approx_knn_search(raft::handle_t& handle, - float* distances, - int64_t* indices, - raft::spatial::knn::knnIndex* index, - value_idx k, - float* query_array, - value_idx n) +template +[[deprecated("Consider using new-style raft::spatial::knn::*::search functions")]] inline void +approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + value_idx k, + T* query_array, + value_idx n) { + common::nvtx::range fun_scope( + "legacy approx_knn_search(k = %u, n_queries = %u)", k, n); detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); } -} // namespace knn -} // namespace spatial -} // namespace raft - -#endif \ No newline at end of file +} // namespace raft::spatial::knn diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index b6d3ca2976..516435271d 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -13,79 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. - */ - -#ifndef __ANN_H -#define __ANN_H #pragma once -#include "ann_common.h" -#include "detail/ann_quantized_faiss.cuh" - -#include -#include - -namespace raft { -namespace spatial { -namespace knn { - -/** - * @brief Flat C++ API function to build an approximate nearest neighbors index - * from an index array and a set of parameters. - * - * @param[in] handle RAFT handle - * @param[out] index index to be built - * @param[in] params parametrization of the index to be built - * @param[in] metric distance metric to use. 
Euclidean (L2) is used by default - * @param[in] metricArg metric argument - * @param[in] index_array the index array to build the index with - * @param[in] n number of rows in the index array - * @param[in] D the dimensionality of the index array - */ -template -inline void approx_knn_build_index(raft::handle_t& handle, - raft::spatial::knn::knnIndex* index, - knnIndexParam* params, - raft::distance::DistanceType metric, - float metricArg, - float* index_array, - value_idx n, - value_idx D) -{ - detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); -} - -/** - * @brief Flat C++ API function to perform an approximate nearest neighbors - * search from previously built index and a query array - * - * @param[in] handle RAFT handle - * @param[out] distances distances of the nearest neighbors toward - * their query point - * @param[out] indices indices of the nearest neighbors - * @param[in] index index to perform a search with - * @param[in] k the number of nearest neighbors to search for - * @param[in] query_array the query to perform a search with - * @param[in] n number of rows in the query array - */ -template -inline void approx_knn_search(raft::handle_t& handle, - float* distances, - int64_t* indices, - raft::spatial::knn::knnIndex* index, - value_idx k, - float* query_array, - value_idx n) -{ - detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); -} - -} // namespace knn -} // namespace spatial -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "ann.cuh" diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 5cdd6b1141..45867dbfee 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -14,8 +14,15 @@ * limitations under the License. */ +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the other approximate KNN implementations defined in spatial/knn/*.") + #pragma once +#include "detail/processing.hpp" +#include "ivf_flat_types.hpp" + #include #include @@ -26,19 +33,43 @@ namespace spatial { namespace knn { struct knnIndex { - faiss::gpu::GpuIndex* index; raft::distance::DistanceType metric; float metricArg; + int nprobe; + std::unique_ptr index; + std::unique_ptr> metric_processor; + std::unique_ptr> ivf_flat_float_; + std::unique_ptr> ivf_flat_uint8_t_; + std::unique_ptr> ivf_flat_int8_t_; - raft::spatial::knn::RmmGpuResources* gpu_res; + std::unique_ptr gpu_res; int device; - ~knnIndex() - { - delete index; - delete gpu_res; - } + + template + auto ivf_flat() -> std::unique_ptr>&; }; +template <> +inline auto knnIndex::ivf_flat() + -> std::unique_ptr>& +{ + return ivf_flat_float_; +} + +template <> +inline auto knnIndex::ivf_flat() + -> std::unique_ptr>& +{ + return ivf_flat_uint8_t_; +} + +template <> +inline auto knnIndex::ivf_flat() + -> std::unique_ptr>& +{ + return ivf_flat_int8_t_; +} + enum QuantizerType : unsigned int { QT_8bit, QT_4bit, @@ -72,6 +103,17 @@ struct IVFSQParam : IVFParam { bool encodeResidual; }; +inline auto from_legacy_index_params(const IVFFlatParam& legacy, + raft::distance::DistanceType metric, + float metric_arg) +{ + ivf_flat::index_params params; + params.metric = metric; + params.metric_arg = metric_arg; + params.n_lists = legacy.nlist; + return params; +} + }; // namespace knn }; // namespace spatial }; // namespace raft diff --git a/cpp/include/raft/spatial/knn/common.hpp b/cpp/include/raft/spatial/knn/common.hpp new file mode 100644 index 0000000000..caaa951a66 --- /dev/null +++ b/cpp/include/raft/spatial/knn/common.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::spatial::knn { + +/** The base for approximate KNN index structures. */ +struct index { +}; + +/** The base for KNN index parameters. */ +struct index_params { + /** Distance type. */ + raft::distance::DistanceType metric = distance::DistanceType::L2Expanded; + /** The argument used by some distance metrics. */ + float metric_arg = 2.0f; + /** + * Whether to add the dataset content to the index, i.e.: + * + * - `true` means the index is filled with the dataset vectors and ready to search after calling + * `build`. + * - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but + * the index is left empty; you'd need to call `extend` on the index afterwards to populate it. 
+ */ + bool add_data_on_build = true; +}; + +struct search_params { +}; + +}; // namespace raft::spatial::knn diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh new file mode 100644 index 0000000000..74e1ae75a8 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -0,0 +1,706 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "ann_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace raft::spatial::knn::detail::kmeans { + +/** + * @brief Predict labels for the dataset; floats only. + * + * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows + * * n_cluster * sizeof(float)). + * + * @param handle + * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim] + * @param n_clusters number of clusters/centers + * @param dim dimensionality of the data + * @param[in] dataset a pointer to the data [n_rows, dim] + * @param n_rows number samples in the `dataset` + * @param[out] labels output predictions [n_rows] + * @param metric + * @param stream + * @param mr (optional) memory resource to use for temporary allocations + */ +void predict_float_core(const handle_t& handle, + const float* centers, + uint32_t n_clusters, + uint32_t dim, + const float* dataset, + size_t n_rows, + uint32_t* labels, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + rmm::device_uvector distances(n_rows * n_clusters, stream, mr); + + float alpha; + float beta; + switch (metric) { + case raft::distance::DistanceType::InnerProduct: { + alpha = -1.0; + beta = 0.0; + } break; + case raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2Unexpanded: { + rmm::device_uvector sqsum_centers(n_clusters, stream, mr); + rmm::device_uvector sqsum_data(n_rows, stream, mr); + utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers.data(), stream); + utils::dots_along_rows(n_rows, dim, dataset, sqsum_data.data(), stream); + utils::outer_add( + sqsum_data.data(), n_rows, sqsum_centers.data(), n_clusters, distances.data(), stream); + alpha = -2.0; + beta = 1.0; + } break; + // NB: update the description of `knn::ivf_flat::build` when adding here a new metric. + default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); + } + linalg::gemm(handle, + true, + false, + n_clusters, + n_rows, + dim, + &alpha, + centers, + dim, + dataset, + dim, + &beta, + distances.data(), + n_clusters, + stream); + utils::argmin_along_rows(n_rows, n_clusters, distances.data(), labels, stream); +} + +/** + * @brief Suggest a minibatch size for kmeans prediction. 
+ * + * This function is used as a heuristic to split the work over a large dataset + * to reduce the size of temporary memory allocations. + * + * @param n_clusters number of clusters in kmeans clustering + * @param n_rows dataset size + * @return a suggested minibatch size + */ +constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32_t +{ + n_clusters = std::max(1, n_clusters); + uint32_t minibatch_size = (1 << 20); + if (minibatch_size > (1 << 28) / n_clusters) { + minibatch_size = (1 << 28) / n_clusters; + minibatch_size += 32; + minibatch_size -= minibatch_size % 64; + } + minibatch_size = uint32_t(std::min(minibatch_size, n_rows)); + return minibatch_size; +} + +/** + * @brief Given the data and labels, calculate cluster centers and sizes in one sweep. + * + * Let S_i = {x_k | x_k \in dataset & labels[k] == i} be the vectors in the dataset with label i. + * On exit centers_i = normalize(\sum_{x \in S_i} x), where `normalize` depends on the distance + * type. + * + * NB: `centers` and `cluster_sizes` must be accessible on GPU due to + * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointers are + * accessible from the same place. + * + * i.e. two variants are possible: + * + * 1. All pointers are on the device. + * 2. All pointers are on the host, but `centers` and `cluster_sizes` are accessible from GPU. + * + * @tparam T element type + * + * @param[inout] centers pointer to the output [n_clusters, dim] + * @param[inout] cluster_sizes number of rows in each cluster [n_clusters] + * @param n_clusters number of clusters/centers + * @param dim dimensionality of the data + * @param[in] dataset a pointer to the data [n_rows, dim] + * @param n_rows number samples in the `dataset` + * @param[in] labels output predictions [n_rows] + * @param reset_counters whether to clear the output arrays before calculating. + * When set to `false`, this function may be used to update existing centers and sizes using + * the weighted average principle. + * @param stream + */ +template +void calc_centers_and_sizes(float* centers, + uint32_t* cluster_sizes, + uint32_t n_clusters, + uint32_t dim, + const T* dataset, + size_t n_rows, + const uint32_t* labels, + bool reset_counters, + rmm::cuda_stream_view stream) +{ + if (reset_counters) { + utils::memzero(centers, n_clusters * dim, stream); + utils::memzero(cluster_sizes, n_clusters, stream); + } else { + utils::map_along_rows( + n_clusters, + dim, + centers, + cluster_sizes, + [] __device__(float c, uint32_t s) -> float { return c * s; }, + stream); + } + utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream); + utils::map_along_rows( + n_clusters, + dim, + centers, + cluster_sizes, + [] __device__(float c, uint32_t s) -> float { return s == 0 ? 0.0f : c / float(s); }, + stream); +} + +/** + * @brief Predict labels for the dataset. 
+ * + * @tparam T element type + * + * @param handle + * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim] + * @param n_clusters number of clusters/centers + * @param dim dimensionality of the data + * @param[in] dataset a pointer to the data [n_rows, dim] + * @param n_rows number samples in the `dataset` + * @param[out] labels output predictions [n_rows] + * @param metric + * @param stream + * @param mr (optional) memory resource to use for temporary allocations + */ + +template +void predict(const handle_t& handle, + const float* centers, + uint32_t n_clusters, + uint32_t dim, + const T* dataset, + size_t n_rows, + uint32_t* labels, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) +{ + common::nvtx::range fun_scope( + "kmeans::predict(%zu, %u)", n_rows, n_clusters); + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } + const uint32_t max_minibatch_size = calc_minibatch_size(n_clusters, n_rows); + rmm::device_uvector cur_dataset( + std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); + auto cur_dataset_ptr = cur_dataset.data(); + for (size_t offset = 0; offset < n_rows; offset += max_minibatch_size) { + auto minibatch_size = std::min(max_minibatch_size, n_rows - offset); + + if constexpr (std::is_same_v) { + cur_dataset_ptr = const_cast(dataset + offset * dim); + } else { + linalg::unaryOp(cur_dataset_ptr, + dataset + offset * dim, + minibatch_size * dim, + utils::mapping{}, + stream); + } + + predict_float_core(handle, + centers, + n_clusters, + dim, + cur_dataset_ptr, + minibatch_size, + labels + offset, + metric, + stream, + mr); + } +} + +/** + * @brief Adjust centers for clusters that have small number of entries. + * + * For each cluster, where the cluster size is not bigger than a threshold, the center is moved + * towards a data point that belongs to a large cluster. + * + * NB: if this function returns `true`, you should update the labels. + * + * NB: all pointers are used on the host side. + * + * @tparam T element type + * + * @param[inout] centers cluster centers [n_clusters, dim] + * @param n_clusters number of rows in `centers` + * @param dim number of columns in `centers` and `dataset` + * @param[in] dataset a host pointer to the row-major data matrix [n_rows, dim] + * @param n_rows number of rows in `dataset` + * @param[in] labels a host pointer to the cluster indices [n_rows] + * @param[in] cluster_sizes number of rows in each cluster [n_clusters] + * @param threshold defines a criterion for adjusting a cluster + * (cluster_sizes <= average_size * threshold) + * 0 <= threshold < 1 + * @param stream + * + * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated). 
+ */ +template +auto adjust_centers(float* centers, + uint32_t n_clusters, + uint32_t dim, + const T* dataset, + size_t n_rows, + const uint32_t* labels, + const uint32_t* cluster_sizes, + float threshold, + rmm::cuda_stream_view stream) -> bool +{ + common::nvtx::range fun_scope( + "kmeans::adjust_centers(%zu, %u)", n_rows, n_clusters); + stream.synchronize(); + if (n_clusters == 0) { return false; } + constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, + 601, 659, 733, 809, 863, 941, 1013, 1069, 1151, 1223, + 1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987, + 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741}; + static size_t i = 0; + static size_t i_primes = 0; + + bool adjusted = false; + uint32_t average = static_cast(n_rows / size_t(n_clusters)); + uint32_t ofst; + + do { + i_primes = (i_primes + 1) % kPrimes.size(); + ofst = kPrimes[i_primes]; + } while (n_rows % ofst == 0); + + for (uint32_t l = 0; l < n_clusters; l++) { + auto csize = cluster_sizes[l]; + // skip big clusters + if (csize > static_cast(average * threshold)) continue; + // choose a "random" i that belongs to a rather large cluster + do { + i = (i + ofst) % n_rows; + } while (cluster_sizes[labels[i]] < average); + // Adjust the center of the selected smaller cluster to gravitate towards + // a sample from the selected larger cluster. + const size_t li = labels[i]; + // Weight of the current center for the weighted average. + // We dump it for anomalously small clusters, but keep constant overwise. + const float wc = std::min(csize, 7.0); + // Weight for the datapoint used to shift the center. + const float wd = 1.0; + for (uint32_t j = 0; j < dim; j++) { + float val = 0; + val += wc * centers[j + dim * li]; + val += wd * utils::mapping{}(dataset[j + size_t(dim) * i]); + val /= wc + wd; + centers[j + dim * l] = val; + } + adjusted = true; + } + stream.synchronize(); + return adjusted; +} + +/** predict & adjust_centers combined in an iterative process. */ +template +void build_clusters(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset, // managedl [n_rows, dim] + size_t n_rows, + uint32_t n_clusters, + float* cluster_centers, // managed; [n_clusters, dim] + uint32_t* cluster_labels, // managed; [n_rows] + uint32_t* cluster_sizes, // managed; [n_clusters] + raft::distance::DistanceType metric, + rmm::mr::device_memory_resource* device_memory, + rmm::cuda_stream_view stream) +{ + // "randomly initialize labels" + auto f = [n_clusters] __device__(uint32_t * out, size_t i) { + *out = uint32_t(i % size_t(n_clusters)); + }; + linalg::writeOnlyUnaryOp(cluster_labels, n_rows, f, stream); + + // update centers to match the initialized labels. + calc_centers_and_sizes( + cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream); + + for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) { + switch (metric) { + // For some metrics, cluster calculation and adjustment tends to favor zero center vectors. + // To avoid converging to zero, we normalize the center vectors on every iteration. 
+ case raft::distance::DistanceType::InnerProduct: + case raft::distance::DistanceType::CosineExpanded: + case raft::distance::DistanceType::CorrelationExpanded: + utils::normalize_rows(n_clusters, dim, cluster_centers, stream); + default: break; + } + predict(handle, + cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + metric, + stream, + device_memory); + calc_centers_and_sizes(cluster_centers, + cluster_sizes, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + true, + stream); + + if (iter + 1 < 2 * n_iters) { + if (kmeans::adjust_centers(cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + cluster_sizes, + (float)1.0 / 4, + stream)) { + iter -= 1; + } + } + } +} + +/** Calculate how many fine clusters should belong to each mesocluster. */ +auto arrange_fine_clusters(uint32_t n_clusters, + uint32_t n_mesoclusters, + size_t n_rows, + const uint32_t* mesocluster_sizes) +{ + std::vector fine_clusters_nums(n_mesoclusters); + std::vector fine_clusters_csum(n_mesoclusters + 1); + fine_clusters_csum[0] = 0; + + uint32_t n_lists_rem = n_clusters; + uint32_t n_nonempty_ms_rem = 0; + for (uint32_t i = 0; i < n_mesoclusters; i++) { + n_nonempty_ms_rem += mesocluster_sizes[i] > 0 ? 1 : 0; + } + size_t n_rows_rem = n_rows; + size_t mesocluster_size_sum = 0; + uint32_t mesocluster_size_max = 0; + uint32_t fine_clusters_nums_max = 0; + for (uint32_t i = 0; i < n_mesoclusters; i++) { + if (i < n_mesoclusters - 1) { + // Although the algorithm is meant to produce balanced clusters, when something + // goes wrong, we may get empty clusters (e.g. during development/debugging). + // The code below ensures a proportional arrangement of fine cluster numbers + // per mesocluster, even if some clusters are empty. + if (mesocluster_sizes[i] == 0) { + fine_clusters_nums[i] = 0; + } else { + n_nonempty_ms_rem--; + auto s = uint32_t((double)n_lists_rem * mesocluster_sizes[i] / n_rows_rem + .5); + s = std::min(s, n_lists_rem - n_nonempty_ms_rem); + fine_clusters_nums[i] = std::max(s, 1); + } + } else { + fine_clusters_nums[i] = n_lists_rem; + } + n_lists_rem -= fine_clusters_nums[i]; + n_rows_rem -= mesocluster_sizes[i]; + mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]); + mesocluster_size_sum += mesocluster_sizes[i]; + fine_clusters_nums_max = max(fine_clusters_nums_max, fine_clusters_nums[i]); + fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i]; + } + + RAFT_EXPECTS(mesocluster_size_sum == n_rows, + "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)", + mesocluster_size_sum, + n_rows); + RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters, + "fine cluster numbers do not add up (%u) to the total number of clusters (%u)", + fine_clusters_csum[n_mesoclusters], + n_clusters + + ); + + return std::make_tuple(mesocluster_size_max, + fine_clusters_nums_max, + std::move(fine_clusters_nums), + std::move(fine_clusters_csum)); +} + +/** + * Given the (coarse) mesoclusters and the distribution of fine clusters within them, + * build the fine clusters. + * + * Processing one mesocluster at a time: + * 1. Copy mesocluster data into a separate buffer + * 2. Predict fine cluster + * 3. Refince the fine cluster centers + * + * As a result, the fine clusters are what is returned by `build_optimized_kmeans`; + * this function returns the total number of fine clusters, which can be checked to be + * the same as the requested number of clusters. 
+ */ +template +auto build_fine_clusters(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset_mptr, + const uint32_t* labels_mptr, + size_t n_rows, + const uint32_t* fine_clusters_nums, + const uint32_t* fine_clusters_csum, + const uint32_t* mesocluster_sizes, + uint32_t n_mesoclusters, + uint32_t mesocluster_size_max, + uint32_t fine_clusters_nums_max, + float* cluster_centers, + raft::distance::DistanceType metric, + rmm::mr::managed_memory_resource* managed_memory, + rmm::mr::device_memory_resource* device_memory, + rmm::cuda_stream_view stream) -> uint32_t +{ + rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); + rmm::device_uvector mc_trainset_buf(mesocluster_size_max * dim, stream, managed_memory); + auto mc_trainset_ids = mc_trainset_ids_buf.data(); + auto mc_trainset = mc_trainset_buf.data(); + + // label (cluster ID) of each vector + rmm::device_uvector mc_trainset_labels(mesocluster_size_max, stream, managed_memory); + + rmm::device_uvector mc_trainset_ccenters( + fine_clusters_nums_max * dim, stream, managed_memory); + // number of vectors in each cluster + rmm::device_uvector mc_trainset_csizes_tmp( + fine_clusters_nums_max, stream, managed_memory); + + // Training clusters in each meso-cluster + uint32_t n_clusters_done = 0; + for (uint32_t i = 0; i < n_mesoclusters; i++) { + uint32_t k = 0; + for (size_t j = 0; j < n_rows; j++) { + if (labels_mptr[j] == i) { mc_trainset_ids[k++] = j; } + } + RAFT_EXPECTS(k == mesocluster_sizes[i], "Incorrect mesocluster size at %d.", i); + if (k == 0) { + RAFT_LOG_DEBUG("Empty cluster %d", i); + RAFT_EXPECTS(fine_clusters_nums[i] == 0, + "Number of fine clusters must be zero for the empty mesocluster (got %d)", + fine_clusters_nums[i]); + continue; + } else { + RAFT_EXPECTS(fine_clusters_nums[i] > 0, + "Number of fine clusters must be non-zero for a non-empty mesocluster"); + } + + utils::copy_selected( + mesocluster_sizes[i], dim, dataset_mptr, mc_trainset_ids, dim, mc_trainset, dim, stream); + + build_clusters(handle, + n_iters, + dim, + mc_trainset, + mesocluster_sizes[i], + fine_clusters_nums[i], + mc_trainset_ccenters.data(), + mc_trainset_labels.data(), + mc_trainset_csizes_tmp.data(), + metric, + device_memory, + stream); + + raft::copy(cluster_centers + (dim * fine_clusters_csum[i]), + mc_trainset_ccenters.data(), + fine_clusters_nums[i] * dim, + stream); + handle.sync_stream(stream); + n_clusters_done += fine_clusters_nums[i]; + } + return n_clusters_done; +} + +/** + * kmeans + * + * @tparam T element type + * + * @param handle + * @param n_iters number of training iterations + * @param dim number of columns in `centers` and `dataset` + * @param[in] dataset a device pointer to the source dataset [n_rows, dim] + * @param n_rows number of rows in the input + * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim] + * @param n_cluster + * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training; + * 0 < trainset_fraction <= 1. 
+ * @param metric the distance metric + * @param stream + */ +template +void build_optimized_kmeans(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset, + size_t n_rows, + float* cluster_centers, + uint32_t n_clusters, + double trainset_fraction, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream) +{ + common::nvtx::range fun_scope( + "kmeans::build_optimized_kmeans(%zu, %u)", n_rows, n_clusters); + + auto trainset_ratio = + std::max(1, n_rows / std::max(trainset_fraction * n_rows, n_clusters)); + auto n_rows_train = n_rows / trainset_ratio; + + uint32_t n_mesoclusters = std::min(n_clusters, std::sqrt(n_clusters) + 0.5); + RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters); + + rmm::mr::managed_memory_resource managed_memory; + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource( + device_memory, kmeans::calc_minibatch_size(n_mesoclusters, n_rows_train) * dim * 4); + if (pool_guard) { + RAFT_LOG_DEBUG( + "kmeans::build_optimized_kmeans: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + rmm::device_uvector trainset(n_rows_train * dim, stream, &managed_memory); + // TODO: a proper sampling + RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), + sizeof(T) * dim, + dataset, + sizeof(T) * dim * trainset_ratio, + sizeof(T) * dim, + n_rows_train, + cudaMemcpyDefault, + stream)); + + // build coarse clusters (mesoclusters) + rmm::device_uvector mesocluster_labels_buf(n_rows_train, stream, &managed_memory); + rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory); + { + rmm::device_uvector mesocluster_centers_buf( + n_mesoclusters * dim, stream, &managed_memory); + build_clusters(handle, + n_iters, + dim, + trainset.data(), + n_rows_train, + n_mesoclusters, + mesocluster_centers_buf.data(), + mesocluster_labels_buf.data(), + mesocluster_sizes_buf.data(), + metric, + device_memory, + stream); + } + + auto mesocluster_sizes = mesocluster_sizes_buf.data(); + auto mesocluster_labels = mesocluster_labels_buf.data(); + + handle.sync_stream(stream); + + // build fine clusters + auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] = + arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows_train, mesocluster_sizes); + + if (mesocluster_size_max * n_mesoclusters > 2 * n_rows_train) { + RAFT_LOG_WARN("build_optimized_kmeans: built unbalanced mesoclusters"); + RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters); + RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters); + } + + auto n_clusters_done = build_fine_clusters(handle, + n_iters, + dim, + trainset.data(), + mesocluster_labels, + n_rows_train, + fine_clusters_nums.data(), + fine_clusters_csum.data(), + mesocluster_sizes, + n_mesoclusters, + mesocluster_size_max, + fine_clusters_nums_max, + cluster_centers, + metric, + &managed_memory, + device_memory, + stream); + RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters."); + + rmm::device_uvector cluster_sizes(n_clusters, stream, device_memory); + rmm::device_uvector labels(n_rows_train, stream, device_memory); + + // fit clusters using the trainset + for (int iter = 0; iter < 2; iter++) { + predict(handle, + cluster_centers, + n_clusters, + dim, + trainset.data(), + n_rows_train, + labels.data(), + metric, + stream, + device_memory); + calc_centers_and_sizes(cluster_centers, + cluster_sizes.data(), + n_clusters, + dim, + trainset.data(), + 
n_rows_train, + labels.data(), + true, + stream); + } +} + +} // namespace raft::spatial::knn::detail::kmeans diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh new file mode 100644 index 0000000000..5a56a84fe3 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../ann_common.h" +#include "../ivf_flat.cuh" +#include "knn_brute_force_faiss.cuh" + +#include "common_faiss.h" +#include "processing.cuh" +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::spatial::knn::detail { + +inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) +{ + switch (qtype) { + case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; + case QuantizerType::QT_8bit_uniform: + return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; + case QuantizerType::QT_4bit_uniform: + return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; + case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; + case QuantizerType::QT_8bit_direct: + return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; + case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; + default: return (faiss::ScalarQuantizer::QuantizerType)qtype; + } +} + +template +void approx_knn_ivfflat_build_index(knnIndex* index, + const IVFFlatParam& params, + IntType n, + IntType D) +{ + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(index->metric); + index->index.reset( + new faiss::gpu::GpuIndexIVFFlat(index->gpu_res.get(), D, params.nlist, faiss_metric, config)); +} + +template +void approx_knn_ivfpq_build_index(knnIndex* index, const IVFPQParam& params, IntType n, IntType D) +{ + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = index->device; + config.usePrecomputedTables = params.usePrecomputedTables; + config.interleavedLayout = params.n_bits != 8; + faiss::MetricType faiss_metric = build_faiss_metric(index->metric); + index->index.reset(new faiss::gpu::GpuIndexIVFPQ( + index->gpu_res.get(), D, params.nlist, params.M, params.n_bits, faiss_metric, config)); +} + +template +void approx_knn_ivfsq_build_index(knnIndex* index, const IVFSQParam& params, IntType n, IntType D) +{ + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(index->metric); + faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params.qtype); + index->index.reset(new faiss::gpu::GpuIndexIVFScalarQuantizer( + index->gpu_res.get(), D, params.nlist, faiss_qtype, faiss_metric, 
+    params.encodeResidual));
+}
+
+template <typename T = float, typename IntType = int>
+void approx_knn_build_index(const handle_t& handle,
+                            knnIndex* index,
+                            knnIndexParam* params,
+                            raft::distance::DistanceType metric,
+                            float metricArg,
+                            T* index_array,
+                            IntType n,
+                            IntType D)
+{
+  auto stream      = handle.get_stream();
+  index->index     = nullptr;
+  index->metric    = metric;
+  index->metricArg = metricArg;
+  if (dynamic_cast<IVFParam*>(params)) {
+    index->nprobe = dynamic_cast<IVFParam*>(params)->nprobe;
+  }
+  auto ivf_ft_pams = dynamic_cast<IVFFlatParam*>(params);
+  auto ivf_pq_pams = dynamic_cast<IVFPQParam*>(params);
+  auto ivf_sq_pams = dynamic_cast<IVFSQParam*>(params);
+
+  if constexpr (std::is_same_v<T, float>) {
+    index->metric_processor = create_processor<float>(metric, n, D, 0, false, stream);
+  }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(index_array); }
+
+  if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2Unexpanded ||
+                      metric == raft::distance::DistanceType::L2Expanded ||
+                      metric == raft::distance::DistanceType::InnerProduct)) {
+    auto new_params = from_legacy_index_params(*ivf_ft_pams, metric, metricArg);
+    index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
+      ivf_flat::build(handle, new_params, index_array, int64_t(n), D));
+  } else {
+    RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
+    index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources());
+    index->gpu_res->noTempMemory();
+    index->gpu_res->setDefaultStream(index->device, stream);
+    if (ivf_ft_pams) {
+      approx_knn_ivfflat_build_index(index, *ivf_ft_pams, n, D);
+    } else if (ivf_pq_pams) {
+      approx_knn_ivfpq_build_index(index, *ivf_pq_pams, n, D);
+    } else if (ivf_sq_pams) {
+      approx_knn_ivfsq_build_index(index, *ivf_sq_pams, n, D);
+    } else {
+      RAFT_FAIL("Unrecognized index type.");
+    }
+    if constexpr (std::is_same_v<T, float>) {
+      index->index->train(n, index_array);
+      index->index->add(n, index_array);
+    } else {
+      RAFT_FAIL("FAISS-based index supports only float data.");
+    }
+  }
+
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->revert(index_array); }
+}
+
+template <typename T = float, typename IntType = int>
+void approx_knn_search(const handle_t& handle,
+                       float* distances,
+                       int64_t* indices,
+                       knnIndex* index,
+                       IntType k,
+                       T* query_array,
+                       IntType n)
+{
+  auto faiss_ivf = dynamic_cast<faiss::gpu::GpuIndexIVF*>(index->index.get());
+  if (faiss_ivf) { faiss_ivf->setNumProbes(index->nprobe); }
+
+  if constexpr (std::is_same_v<T, float>) {
+    index->metric_processor->preprocess(query_array);
+    index->metric_processor->set_num_queries(k);
+  }
+
+  // search
+  if (faiss_ivf) {
+    if constexpr (std::is_same_v<T, float>) {
+      faiss_ivf->search(n, query_array, k, distances, indices);
+    } else {
+      RAFT_FAIL("FAISS-based index supports only float data.");
+    }
+  } else if (index->ivf_flat<T, int64_t>()) {
+    ivf_flat::search_params params;
+    params.n_probes = index->nprobe;
+    ivf_flat::search(
+      handle, params, *(index->ivf_flat<T, int64_t>()), query_array, n, k, indices, distances);
+  } else {
+    RAFT_FAIL("The model is not trained");
+  }
+
+  // revert changes to the query
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->revert(query_array); }
+
+  // perform post-processing to show the real distances
+  if (index->metric == raft::distance::DistanceType::L2SqrtExpanded ||
+      index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
+      index->metric == raft::distance::DistanceType::LpUnexpanded) {
+    /**
+     * post-processing
+     */
+    float p = 0.5;  // standard l2
+    if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
+    raft::linalg::unaryOp(
+      distances,
+      distances,
+      n * k,
+      [p] __device__(float input) { return powf(input, p); },
+      handle.get_stream());
+  }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->postprocess(distances); }
+}
+
+}  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
deleted file mode 100644
index 78631b431f..0000000000
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "../ann_common.h"
-#include "knn_brute_force_faiss.cuh"
-
-#include "common_faiss.h"
-#include "processing.hpp"
-
-#include "processing.hpp"
-#include
-#include
-
-#include