From 4c089f681d96f4bcf927842c098eab84a0920925 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Feb 2022 18:21:59 -0500 Subject: [PATCH] Moving remaining stats prims from cuml (#507) Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/507 --- cpp/include/raft/common/seive.hpp | 125 +++++ cpp/include/raft/device_utils.cuh | 108 ++++ cpp/include/raft/random/detail/make_blobs.cuh | 133 +++-- .../knn/detail/epsilon_neighborhood.cuh | 243 +++++++++ .../raft/spatial/knn/epsilon_neighborhood.hpp | 59 +++ cpp/include/raft/stats/common.hpp | 67 +++ cpp/include/raft/stats/cov.hpp | 58 +++ cpp/include/raft/stats/detail/cov.cuh | 95 ++++ cpp/include/raft/stats/detail/histogram.cuh | 492 ++++++++++++++++++ cpp/include/raft/stats/detail/minmax.cuh | 239 +++++++++ .../raft/stats/detail/weighted_mean.cuh | 94 ++++ cpp/include/raft/stats/histogram.hpp | 62 +++ cpp/include/raft/stats/minmax.hpp | 70 +++ cpp/include/raft/stats/weighted_mean.hpp | 60 +++ cpp/test/CMakeLists.txt | 6 + cpp/test/common/seive.cu | 35 ++ cpp/test/linalg/rsvd.cu | 112 ++-- cpp/test/random/make_blobs.cu | 108 ++-- cpp/test/spatial/epsilon_neighborhood.cu | 129 +++++ cpp/test/stats/cov.cu | 185 +++++++ cpp/test/stats/histogram.cu | 262 ++++++++++ cpp/test/stats/minmax.cu | 202 +++++++ cpp/test/stats/weighted_mean.cu | 231 ++++++++ 23 files changed, 3027 insertions(+), 148 deletions(-) create mode 100644 cpp/include/raft/common/seive.hpp create mode 100644 cpp/include/raft/device_utils.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh create mode 100644 cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp create mode 100644 cpp/include/raft/stats/common.hpp create mode 100644 cpp/include/raft/stats/cov.hpp create mode 100644 cpp/include/raft/stats/detail/cov.cuh create mode 100644 cpp/include/raft/stats/detail/histogram.cuh create mode 100644 cpp/include/raft/stats/detail/minmax.cuh create mode 100644 cpp/include/raft/stats/detail/weighted_mean.cuh create mode 100644 cpp/include/raft/stats/histogram.hpp create mode 100644 cpp/include/raft/stats/minmax.hpp create mode 100644 cpp/include/raft/stats/weighted_mean.hpp create mode 100644 cpp/test/common/seive.cu create mode 100644 cpp/test/spatial/epsilon_neighborhood.cu create mode 100644 cpp/test/stats/cov.cu create mode 100644 cpp/test/stats/histogram.cu create mode 100644 cpp/test/stats/minmax.cu create mode 100644 cpp/test/stats/weighted_mean.cu diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp new file mode 100644 index 0000000000..e613f1e5c2 --- /dev/null +++ b/cpp/include/raft/common/seive.hpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +// Taken from: +// https://github.com/teju85/programming/blob/master/euler/include/seive.h + +namespace raft { +namespace common { + +/** + * @brief Implementation of 'Seive of Eratosthenes' + */ +class Seive { + public: + /** + * @param _num number of integers for which seive is needed + */ + Seive(unsigned _num) + { + N = _num; + generateSeive(); + } + + /** + * @brief Check whether a number is prime or not + * @param num number to be checked + * @return true if the 'num' is prime, else false + */ + bool isPrime(unsigned num) const + { + unsigned mask, pos; + if (num <= 1) { return false; } + if (num == 2) { return true; } + if (!(num & 1)) { return false; } + getMaskPos(num, mask, pos); + return (seive[pos] & mask); + } + + private: + void generateSeive() + { + auto sqN = fastIntSqrt(N); + auto size = raft::ceildiv(N, sizeof(unsigned) * 8); + seive.resize(size); + // assume all to be primes initially + for (auto& itr : seive) { + itr = 0xffffffffu; + } + unsigned cid = 0; + unsigned cnum = getNum(cid); + while (cnum <= sqN) { + do { + ++cid; + cnum = getNum(cid); + if (isPrime(cnum)) { break; } + } while (cnum <= sqN); + auto cnum2 = cnum << 1; + // 'unmark' all the 'odd' multiples of the current prime + for (unsigned i = 3, num = i * cnum; num <= N; i += 2, num += cnum2) { + unmark(num); + } + } + } + + unsigned getId(unsigned num) const { return (num >> 1); } + + unsigned getNum(unsigned id) const + { + if (id == 0) { return 2; } + return ((id << 1) + 1); + } + + void getMaskPos(unsigned num, unsigned& mask, unsigned& pos) const + { + pos = getId(num); + mask = 1 << (pos & 0x1f); + pos >>= 5; + } + + void unmark(unsigned num) + { + unsigned mask, pos; + getMaskPos(num, mask, pos); + seive[pos] &= ~mask; + } + + // REF: http://www.azillionmonkeys.com/qed/ulerysqroot.pdf + unsigned fastIntSqrt(unsigned val) + { + unsigned g = 0; + auto bshft = 15u, b = 1u << bshft; + do { + unsigned temp = ((g << 1) + b) << bshft--; + if (val >= temp) { + g += b; + val -= temp; + } + } while (b >>= 1); + return g; + } + + /** find all primes till this number */ + unsigned N; + /** the seive */ + std::vector seive; +}; +}; // namespace common +}; // namespace raft diff --git a/cpp/include/raft/device_utils.cuh b/cpp/include/raft/device_utils.cuh new file mode 100644 index 0000000000..d89a484109 --- /dev/null +++ b/cpp/include/raft/device_utils.cuh @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include // pair + +namespace raft { + +// TODO move to raft https://github.com/rapidsai/raft/issues/90 +/** helper method to get the compute capability version numbers */ +inline std::pair getDeviceCapability() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int major, minor; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId)); + return std::make_pair(major, minor); +} + +/** + * @brief Batched warp-level sum reduction + * + * @tparam T data type + * @tparam NThreads Number of threads in the warp doing independent reductions + * + * @param[in] val input value + * @return for the first "group" of threads, the reduced value. All + * others will contain unusable values! + * + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block and also doesn't support this kind of + * batched reduction operation + * @note All threads in the warp must enter this function together + * + * @todo Expand this to support arbitrary reduction ops + */ +template +DI T batchedWarpReduce(T val) +{ +#pragma unroll + for (int i = NThreads; i < raft::WarpSize; i <<= 1) { + val += raft::shfl(val, raft::laneId() + i); + } + return val; +} + +/** + * @brief 1-D block-level batched sum reduction + * + * @tparam T data type + * @tparam NThreads Number of threads in the warp doing independent reductions + * + * @param val input value + * @param smem shared memory region needed for storing intermediate results. It + * must alteast be of size: `sizeof(T) * nWarps * NThreads` + * @return for the first "group" of threads in the block, the reduced value. + * All others will contain unusable values! + * + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block and also doesn't support this kind of + * batched reduction operation + * @note All threads in the block must enter this function together + * + * @todo Expand this to support arbitrary reduction ops + */ +template +DI T batchedBlockReduce(T val, char* smem) +{ + auto* sTemp = reinterpret_cast(smem); + constexpr int nGroupsPerWarp = raft::WarpSize / NThreads; + static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!"); + const int nGroups = (blockDim.x + NThreads - 1) / NThreads; + const int lid = raft::laneId(); + const int lgid = lid % NThreads; + const int gid = threadIdx.x / NThreads; + const auto wrIdx = (gid / nGroupsPerWarp) * NThreads + lgid; + const auto rdIdx = gid * NThreads + lgid; + for (int i = nGroups; i > 0;) { + auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp; + if (gid < iAligned) { + val = batchedWarpReduce(val); + if (lid < NThreads) sTemp[wrIdx] = val; + } + __syncthreads(); + i /= nGroupsPerWarp; + if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); } + __syncthreads(); + } + return val; +} + +} // namespace raft diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh index 528d20a284..fff1ab835b 100644 --- a/cpp/include/raft/random/detail/make_blobs.cuh +++ b/cpp/include/raft/random/detail/make_blobs.cuh @@ -16,18 +16,18 @@ #pragma once +#include "permute.cuh" #include #include #include -#include #include #include #include -namespace raft::random { -namespace detail { +namespace raft { +namespace random { -namespace { +namespace detail { // generate the labels first and shuffle them instead of shuffling the dataset template @@ -90,23 +90,29 @@ DI void get_mu_sigma(DataT& mu, } template -void generate_data(DataT* out, - const IdxT* labels, - IdxT n_rows, - IdxT n_cols, - IdxT n_clusters, - cudaStream_t stream, - bool row_major, - const DataT* centers, - const DataT* cluster_std, - const DataT cluster_std_scalar, - raft::random::Rng& rng) +__global__ void generate_data_kernel(DataT* out, + const IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + bool row_major, + const DataT* centers, + const DataT* cluster_std, + const DataT cluster_std_scalar, + raft::random::RngState rng_state) { - auto op = [=] __device__(DataT & val1, DataT & val2, IdxT idx1, IdxT idx2) { + uint64_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + raft::random::PhiloxGenerator gen(rng_state, tid); + const IdxT stride = gridDim.x * blockDim.x; + IdxT len = n_rows * n_cols; + for (IdxT idx = tid; idx < len; idx += stride) { + DataT val1, val2; + gen.next(val1); + gen.next(val2); DataT mu1, sigma1, mu2, sigma2; get_mu_sigma(mu1, sigma1, - idx1, + idx, labels, row_major, centers, @@ -117,7 +123,7 @@ void generate_data(DataT* out, n_clusters); get_mu_sigma(mu2, sigma2, - idx2, + idx + stride, labels, row_major, centers, @@ -127,12 +133,74 @@ void generate_data(DataT* out, n_cols, n_clusters); raft::random::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2); - }; - rng.custom_distribution2(out, n_rows * n_cols, op, stream); + + if (idx < len) out[idx] = val1; + idx += stride; + if (idx < len) out[idx] = val2; + } } -} // namespace +template +void generate_data(DataT* out, + const IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + cudaStream_t stream, + bool row_major, + const DataT* centers, + const DataT* cluster_std, + const DataT cluster_std_scalar, + raft::random::RngState& rng_state) +{ + IdxT items = n_rows * n_cols; + IdxT nBlocks = (items + 127) / 128; + generate_data_kernel<<>>(out, + labels, + n_rows, + n_cols, + n_clusters, + row_major, + centers, + cluster_std, + cluster_std_scalar, + rng_state); +} +/** + * @brief GPU-equivalent of sklearn.datasets.make_blobs + * + * @tparam DataT output data type + * @tparam IdxT indexing arithmetic type + * + * @param[out] out generated data [on device] + * [dim = n_rows x n_cols] + * @param[out] labels labels for the generated data [on device] + * [len = n_rows] + * @param[in] n_rows number of rows in the generated data + * @param[in] n_cols number of columns in the generated data + * @param[in] n_clusters number of clusters (or classes) to generate + * @param[in] stream cuda stream to schedule the work on + * @param[in] row_major whether input `centers` and output `out` + * buffers are to be stored in row or column + * major layout + * @param[in] centers centers of each of the cluster, pass a nullptr + * if you need this also to be generated randomly + * [on device] [dim = n_clusters x n_cols] + * @param[in] cluster_std standard deviation of each cluster center, + * pass a nullptr if this is to be read from the + * `cluster_std_scalar`. [on device] + * [len = n_clusters] + * @param[in] cluster_std_scalar if 'cluster_std' is nullptr, then use this as + * the std-dev across all dimensions. + * @param[in] shuffle shuffle the generated dataset and labels + * @param[in] center_box_min min value of box from which to pick cluster + * centers. Useful only if 'centers' is nullptr + * @param[in] center_box_max max value of box from which to pick cluster + * centers. Useful only if 'centers' is nullptr + * @param[in] seed seed for the RNG + * @param[in] type RNG type + */ template void make_blobs_caller(DataT* out, IdxT* labels, @@ -140,15 +208,15 @@ void make_blobs_caller(DataT* out, IdxT n_cols, IdxT n_clusters, cudaStream_t stream, - bool row_major = true, - const DataT* centers = nullptr, - const DataT* cluster_std = nullptr, - const DataT cluster_std_scalar = (DataT)1.0, - bool shuffle = true, - DataT center_box_min = (DataT)-10.0, - DataT center_box_max = (DataT)10.0, - uint64_t seed = 0ULL, - raft::random::GeneratorType type = raft::random::GenPhilox) + bool row_major, + const DataT* centers, + const DataT* cluster_std, + const DataT cluster_std_scalar, + bool shuffle, + DataT center_box_min, + DataT center_box_max, + uint64_t seed, + raft::random::GeneratorType type) { raft::random::Rng r(seed, type); // use the right centers buffer for data generation @@ -172,8 +240,9 @@ void make_blobs_caller(DataT* out, _centers, cluster_std, cluster_std_scalar, - r); + r.state); } } // end namespace detail -} // end namespace raft::random \ No newline at end of file +} // end namespace random +} // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh new file mode 100644 index 0000000000..3b4a8d4174 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace spatial { +namespace knn { +namespace detail { + +template > +struct EpsUnexpL2SqNeighborhood : public BaseClass { + private: + typedef Policy P; + + bool* adj; + DataT eps; + IdxT* vd; + + char* smem; // for final reductions + + DataT acc[P::AccRowsPerTh][P::AccColsPerTh]; + + public: + DI EpsUnexpL2SqNeighborhood(bool* _adj, + IdxT* _vd, + const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + DataT _eps, + char* _smem) + : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem) + { + } + + DI void run() + { + prolog(); + loop(); + epilog(); + } + + private: + DI void prolog() + { + this->ldgXY(0); +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + acc[i][j] = BaseClass::Zero; + } + } + this->stsXY(); + __syncthreads(); + this->pageWr ^= 1; + } + + DI void loop() + { + for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { + this->ldgXY(kidx); + accumulate(); // on the previous k-block + this->stsXY(); + __syncthreads(); + this->pageWr ^= 1; + this->pageRd ^= 1; + } + accumulate(); // last iteration + } + + DI void epilog() + { + IdxT startx = blockIdx.x * P::Mblk + this->accrowid; + IdxT starty = blockIdx.y * P::Nblk + this->acccolid; + auto lid = raft::laneId(); + IdxT sums[P::AccColsPerTh]; +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + sums[j] = 0; + } +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto xid = startx + i * P::AccThRows; +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + auto yid = starty + j * P::AccThCols; + auto is_neigh = acc[i][j] <= eps; + ///@todo: fix uncoalesced writes using shared mem + if (xid < this->m && yid < this->n) { + adj[xid * this->n + yid] = is_neigh; + sums[j] += is_neigh; + } + } + } + // perform reduction of adjacency values to compute vertex degrees + if (vd != nullptr) { updateVertexDegree(sums); } + } + + DI void accumulate() + { +#pragma unroll + for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { + this->ldsXY(ki); +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { +#pragma unroll + for (int v = 0; v < P::Veclen; ++v) { + auto diff = this->regx[i][v] - this->regy[j][v]; + acc[i][j] += diff * diff; + } + } + } + } + } + + DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh]) + { + __syncthreads(); // so that we can safely reuse smem + int gid = threadIdx.x / P::AccThCols; + int lid = threadIdx.x % P::AccThCols; + auto cidx = IdxT(blockIdx.y) * P::Nblk + lid; + IdxT totalSum = 0; + // update the individual vertex degrees +#pragma unroll + for (int i = 0; i < P::AccColsPerTh; ++i) { + sums[i] = batchedBlockReduce(sums[i], smem); + auto cid = cidx + i * P::AccThCols; + if (gid == 0 && cid < this->n) { + atomicUpdate(cid, sums[i]); + totalSum += sums[i]; + } + __syncthreads(); // for safe smem reuse + } + // update the total edge count + totalSum = raft::blockReduce(totalSum, smem); + if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); } + } + + DI void atomicUpdate(IdxT addrId, IdxT val) + { + if (sizeof(IdxT) == 4) { + raft::myAtomicAdd((unsigned*)(vd + addrId), val); + } else if (sizeof(IdxT) == 8) { + raft::myAtomicAdd((unsigned long long*)(vd + addrId), val); + } + } +}; // struct EpsUnexpL2SqNeighborhood + +template +__global__ __launch_bounds__(Policy::Nthreads, 2) + + void epsUnexpL2SqNeighKernel( + bool* adj, IdxT* vd, const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k, DataT eps) +{ + extern __shared__ char smem[]; + EpsUnexpL2SqNeighborhood obj(adj, vd, x, y, m, n, k, eps, smem); + obj.run(); +} + +template +void epsUnexpL2SqNeighImpl(bool* adj, + IdxT* vd, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + cudaStream_t stream) +{ + typedef typename raft::linalg::Policy4x4::Policy Policy; + dim3 grid(raft::ceildiv(m, Policy::Mblk), raft::ceildiv(n, Policy::Nblk)); + dim3 blk(Policy::Nthreads); + epsUnexpL2SqNeighKernel + <<>>(adj, vd, x, y, m, n, k, eps); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +/** + * @brief Computes epsilon neighborhood for the L2-Squared distance metric + * + * @tparam DataT IO and math type + * @tparam IdxT Index type + * + * @param[out] adj adjacency matrix [row-major] [on device] [dim = m x n] + * @param[out] vd vertex degree array [on device] [len = m + 1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. + * @param[in] x first matrix [row-major] [on device] [dim = m x k] + * @param[in] y second matrix [row-major] [on device] [dim = n x k] + * @param[in] eps defines epsilon neighborhood radius (should be passed as + * squared as we compute L2-squared distance in this method) + * @param[in] fop device lambda to do any other custom functions + * @param[in] stream cuda stream + */ +template +void epsUnexpL2SqNeighborhood(bool* adj, + IdxT* vd, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + cudaStream_t stream) +{ + size_t bytes = sizeof(DataT) * k; + if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } else { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } +} +} // namespace detail +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp new file mode 100644 index 0000000000..cd9163096a --- /dev/null +++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace spatial { +namespace knn { + +/** + * @brief Computes epsilon neighborhood for the L2-Squared distance metric + * + * @tparam DataT IO and math type + * @tparam IdxT Index type + * + * @param[out] adj adjacency matrix [row-major] [on device] [dim = m x n] + * @param[out] vd vertex degree array [on device] [len = m + 1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. + * @param[in] x first matrix [row-major] [on device] [dim = m x k] + * @param[in] y second matrix [row-major] [on device] [dim = n x k] + * @param[in] m number of rows in x + * @param[in] n number of rows in y + * @param[in] k number of columns in x and k + * @param[in] eps defines epsilon neighborhood radius (should be passed as + * squared as we compute L2-squared distance in this method) + * @param[in] stream cuda stream + */ +template +void epsUnexpL2SqNeighborhood(bool* adj, + IdxT* vd, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + cudaStream_t stream) +{ + detail::epsUnexpL2SqNeighborhood(adj, vd, x, y, m, n, k, eps, stream); +} +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/stats/common.hpp b/cpp/include/raft/stats/common.hpp new file mode 100644 index 0000000000..765f07a012 --- /dev/null +++ b/cpp/include/raft/stats/common.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// This file is a shameless amalgamation of independent works done by +// Lars Nyland and Andy Adinets + +///@todo: add cub's histogram as another option + +namespace raft { +namespace stats { + +/** Default mapper which just returns the value of the data itself */ +template +struct IdentityBinner { + DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); } +}; + +/** Types of support histogram implementations */ +enum HistType { + /** shared mem atomics but with bins to be 1b int's */ + HistTypeSmemBits1 = 1, + /** shared mem atomics but with bins to be 2b int's */ + HistTypeSmemBits2 = 2, + /** shared mem atomics but with bins to be 4b int's */ + HistTypeSmemBits4 = 4, + /** shared mem atomics but with bins to ba 1B int's */ + HistTypeSmemBits8 = 8, + /** shared mem atomics but with bins to be 2B int's */ + HistTypeSmemBits16 = 16, + /** use only global atomics */ + HistTypeGmem, + /** uses shared mem atomics to reduce global traffic */ + HistTypeSmem, + /** + * uses shared mem atomics with match_any intrinsic to further reduce shared + * memory traffic. This can only be enabled on Volta and later architectures. + * If one tries to enable this for older arch's, it will fall back to + * `HistTypeSmem`. + * @note This is to be used only when the input dataset leads to a lot of + * repetitions in a given warp, else, this algo can be much slower than + * `HistTypeSmem`! + */ + HistTypeSmemMatchAny, + /** builds a hashmap of active bins in shared mem */ + HistTypeSmemHash, + /** decide at runtime the best algo for the given inputs */ + HistTypeAuto +}; +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/cov.hpp b/cpp/include/raft/stats/cov.hpp new file mode 100644 index 0000000000..dc5bc63ee8 --- /dev/null +++ b/cpp/include/raft/stats/cov.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +namespace raft { +namespace stats { +/** + * @brief Compute covariance of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @param covar the output covariance matrix + * @param data the input matrix (this will get mean-centered at the end!) + * @param mu mean vector of the input matrix + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample covariance or not. In other words, + * whether to normalize the output using N-1 or N, for true or false, + * respectively + * @param rowMajor whether the input data is row or col major + * @param stable whether to run the slower-but-numerically-stable version or not + * @param handle cublas handle + * @param stream cuda stream + * @note if stable=true, then the input data will be mean centered after this + * function returns! + */ +template +void cov(const raft::handle_t& handle, + Type* covar, + Type* data, + const Type* mu, + std::size_t D, + std::size_t N, + bool sample, + bool rowMajor, + bool stable, + cudaStream_t stream) +{ + detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream); +} +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh new file mode 100644 index 0000000000..7e3fc701a1 --- /dev/null +++ b/cpp/include/raft/stats/detail/cov.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace stats { +namespace detail { +/** + * @brief Compute covariance of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @param covar the output covariance matrix + * @param data the input matrix (this will get mean-centered at the end!) + * @param mu mean vector of the input matrix + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample covariance or not. In other words, + * whether to normalize the output using N-1 or N, for true or false, + * respectively + * @param rowMajor whether the input data is row or col major + * @param stable whether to run the slower-but-numerically-stable version or not + * @param handle cublas handle + * @param stream cuda stream + * @note if stable=true, then the input data will be mean centered after this + * function returns! + */ +template +void cov(const raft::handle_t& handle, + Type* covar, + Type* data, + const Type* mu, + std::size_t D, + std::size_t N, + bool sample, + bool rowMajor, + bool stable, + cudaStream_t stream) +{ + if (stable) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + // since mean operation is assumed to be along a given column, broadcast + // must be along rows! + raft::stats::meanCenter(data, data, mu, D, N, rowMajor, true, stream); + Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N)); + Type beta = Type(0); + if (rowMajor) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_T, + D, + D, + N, + &alpha, + data, + D, + data, + D, + &beta, + covar, + D, + stream)); + } else { + raft::linalg::gemm( + handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); + } + } else { + ///@todo: implement this using cutlass + customized epilogue! + ASSERT(false, "cov: Implement stable=false case!"); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh new file mode 100644 index 0000000000..65241f524f --- /dev/null +++ b/cpp/include/raft/stats/detail/histogram.cuh @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +// This file is a shameless amalgamation of independent works done by +// Lars Nyland and Andy Adinets + +///@todo: add cub's histogram as another option + +namespace raft { +namespace stats { +namespace detail { + +static const int ThreadsPerBlock = 256; + +template +dim3 computeGridDim(IdxT nrows, IdxT ncols, const void* kernel) +{ + int occupancy; + RAFT_CUDA_TRY( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, ThreadsPerBlock, 0)); + const auto maxBlks = occupancy * raft::getMultiProcessorCount(); + int nblksx = raft::ceildiv(VecLen ? nrows / VecLen : nrows, ThreadsPerBlock); + // for cases when there aren't a lot of blocks for computing one histogram + nblksx = std::min(nblksx, maxBlks); + return dim3(nblksx, ncols); +} + +template +DI void histCoreOp(const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner, CoreOp op, IdxT col) +{ + IdxT offset = col * nrows; + auto bdim = IdxT(blockDim.x); + IdxT tid = threadIdx.x + bdim * blockIdx.x; + tid *= VecLen; + IdxT stride = bdim * gridDim.x * VecLen; + int nCeil = raft::alignTo(nrows, stride); + typedef raft::TxN_t VecType; + VecType a; + for (auto i = tid; i < nCeil; i += stride) { + if (i < nrows) { a.load(data, offset + i); } +#pragma unroll + for (int j = 0; j < VecLen; ++j) { + int binId = binner(a.val.data[j], i + j, col); + op(binId, i + j, col); + } + } +} + +template +__global__ void gmemHistKernel( + int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) +{ + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + if (row >= nrows) return; + auto binOffset = col * nbins; +#if __CUDA_ARCH__ < 700 + raft::myAtomicAdd(bins + binOffset + binId, 1); +#else + auto amask = __activemask(); + auto mask = __match_any_sync(amask, binId); + auto leader = __ffs(mask) - 1; + if (raft::laneId() == leader) { raft::myAtomicAdd(bins + binOffset + binId, __popc(mask)); } +#endif // __CUDA_ARCH__ + }; + histCoreOp(data, nrows, nbins, binner, op, blockIdx.y); +} + +template +void gmemHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + auto blks = computeGridDim( + nrows, ncols, (const void*)gmemHistKernel); + gmemHistKernel + <<>>(bins, data, nrows, nbins, binner); +} + +template +__global__ void smemHistKernel( + int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) +{ + extern __shared__ unsigned sbins[]; + for (auto i = threadIdx.x; i < nbins; i += blockDim.x) { + sbins[i] = 0; + } + __syncthreads(); + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + if (row >= nrows) return; +#if __CUDA_ARCH__ < 700 + raft::myAtomicAdd(sbins + binId, 1); +#else + if (UseMatchAny) { + auto amask = __activemask(); + auto mask = __match_any_sync(amask, binId); + auto leader = __ffs(mask) - 1; + if (raft::laneId() == leader) { + raft::myAtomicAdd(sbins + binId, __popc(mask)); + } + } else { + raft::myAtomicAdd(sbins + binId, 1); + } +#endif // __CUDA_ARCH__ + }; + IdxT col = blockIdx.y; + histCoreOp(data, nrows, nbins, binner, op, col); + __syncthreads(); + auto binOffset = col * nbins; + for (auto i = threadIdx.x; i < nbins; i += blockDim.x) { + auto val = sbins[i]; + if (val > 0) { raft::myAtomicAdd((unsigned int*)bins + binOffset + i, val); } + } +} + +template +void smemHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + auto blks = computeGridDim( + nrows, ncols, (const void*)smemHistKernel); + size_t smemSize = nbins * sizeof(unsigned); + smemHistKernel + <<>>(bins, data, nrows, nbins, binner); +} + +template +struct BitsInfo { + static unsigned const BIN_BITS = _BIN_BITS; + static unsigned const WORD_BITS = sizeof(unsigned) * 8; + static unsigned const WORD_BINS = WORD_BITS / BIN_BITS; + static unsigned const BIN_MASK = (1 << BIN_BITS) - 1; +}; + +template +DI void incrementBin(unsigned* sbins, int* bins, int nbins, int binId) +{ + typedef BitsInfo Bits; + auto iword = binId / Bits::WORD_BINS; + auto ibin = binId % Bits::WORD_BINS; + auto sh = ibin * Bits::BIN_BITS; + auto old_word = atomicAdd(sbins + iword, unsigned(1 << sh)); + auto new_word = old_word + unsigned(1 << sh); + if ((new_word >> sh & Bits::BIN_MASK) != 0) return; + // overflow + raft::myAtomicAdd((unsigned int*)bins + binId, Bits::BIN_MASK + 1); + for (int dbin = 1; ibin + dbin < Bits::WORD_BINS && binId + dbin < nbins; ++dbin) { + auto sh1 = (ibin + dbin) * Bits::BIN_BITS; + if ((new_word >> sh1 & Bits::BIN_MASK) == 0) { + // overflow + raft::myAtomicAdd((unsigned int*)bins + binId + dbin, Bits::BIN_MASK); + } else { + // correction + raft::myAtomicAdd(bins + binId + dbin, -1); + break; + } + } +} + +template <> +DI void incrementBin<1>(unsigned* sbins, int* bins, int nbins, int binId) +{ + typedef BitsInfo<1> Bits; + auto iword = binId / Bits::WORD_BITS; + auto sh = binId % Bits::WORD_BITS; + auto old_word = atomicXor(sbins + iword, unsigned(1 << sh)); + if ((old_word >> sh & 1) != 0) raft::myAtomicAdd(bins + binId, 2); +} + +template +__global__ void smemBitsHistKernel( + int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) +{ + extern __shared__ unsigned sbins[]; + typedef BitsInfo Bits; + auto nwords = raft::ceildiv(nbins, Bits::WORD_BINS); + for (auto j = threadIdx.x; j < nwords; j += blockDim.x) { + sbins[j] = 0; + } + __syncthreads(); + IdxT col = blockIdx.y; + IdxT binOffset = col * nbins; + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + if (row >= nrows) return; + incrementBin(sbins, bins + binOffset, (int)nbins, binId); + }; + histCoreOp(data, nrows, nbins, binner, op, col); + __syncthreads(); + for (auto j = threadIdx.x; j < (int)nbins; j += blockDim.x) { + auto shift = j % Bits::WORD_BINS * Bits::BIN_BITS; + int count = sbins[j / Bits::WORD_BINS] >> shift & Bits::BIN_MASK; + if (count > 0) raft::myAtomicAdd(bins + binOffset + j, count); + } +} + +template +void smemBitsHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + typedef BitsInfo Bits; + auto blks = computeGridDim( + nrows, ncols, (const void*)smemBitsHistKernel); + size_t smemSize = raft::ceildiv(nbins, Bits::WORD_BITS / Bits::BIN_BITS) * sizeof(int); + smemBitsHistKernel + <<>>(bins, data, nrows, nbins, binner); +} + +#define INVALID_KEY -1 + +DI void clearHashTable(int2* ht, int hashSize) +{ + for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) { + ht[i] = {INVALID_KEY, 0}; + } +} + +DI int findEntry(int2* ht, int hashSize, int binId, int threshold) +{ + int idx = binId % hashSize; + int t; + int count = 0; + while ((t = atomicCAS(&(ht[idx].x), INVALID_KEY, binId)) != INVALID_KEY && t != binId) { + ++count; + if (count >= threshold) { + idx = INVALID_KEY; + break; + } + ++idx; + if (idx >= hashSize) { idx = 0; } + } + return idx; +} + +DI void flushHashTable(int2* ht, int hashSize, int* bins, int nbins, int col) +{ + int binOffset = col * nbins; + for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) { + if (ht[i].x != INVALID_KEY && ht[i].y > 0) { + raft::myAtomicAdd(bins + binOffset + ht[i].x, ht[i].y); + } + ht[i] = {INVALID_KEY, 0}; + } +} + +#undef INVALID_KEY + +///@todo: honor VecLen template param +template +__global__ void smemHashHistKernel(int* bins, + const DataT* data, + IdxT nrows, + IdxT nbins, + BinnerOp binner, + int hashSize, + int threshold) +{ + extern __shared__ int2 ht[]; + int* needFlush = (int*)&(ht[hashSize]); + if (threadIdx.x == 0) { needFlush[0] = 0; } + clearHashTable(ht, hashSize); + __syncthreads(); + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + bool iNeedFlush = false; + if (row < nrows) { + int hidx = findEntry(ht, hashSize, binId, threshold); + if (hidx >= 0) { + raft::myAtomicAdd(&(ht[hidx].y), 1); + } else { + needFlush[0] = 1; + iNeedFlush = true; + } + } + __syncthreads(); + if (needFlush[0]) { + flushHashTable(ht, hashSize, bins, nbins, col); + __syncthreads(); + if (threadIdx.x == 0) { needFlush[0] = 0; } + __syncthreads(); + } + if (iNeedFlush) { + int hidx = findEntry(ht, hashSize, binId, threshold); + // all threads are bound to get one valid entry as all threads in this + // block will make forward progress due to the __syncthreads call in the + // subsequent iteration + raft::myAtomicAdd(&(ht[hidx].y), 1); + } + }; + IdxT col = blockIdx.y; + histCoreOp(data, nrows, nbins, binner, op, col); + __syncthreads(); + flushHashTable(ht, hashSize, bins, nbins, col); +} + +inline int computeHashTableSize() +{ + // we shouldn't have this much of shared memory available anytime soon! + static const unsigned maxBinsEverPossible = 256 * 1024; + static raft::common::Seive primes(maxBinsEverPossible); + unsigned smem = raft::getSharedMemPerBlock(); + // divide-by-2 because hash table entry stores 2 elements: idx and count + auto binsPossible = smem / sizeof(unsigned) / 2; + for (; binsPossible > 1; --binsPossible) { + if (primes.isPrime(binsPossible)) return (int)binsPossible; + } + return 1; // should not happen! +} + +template +void smemHashHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + static const int flushThreshold = 10; + auto blks = computeGridDim( + nrows, ncols, (const void*)smemHashHistKernel); + int hashSize = computeHashTableSize(); + size_t smemSize = hashSize * sizeof(int2) + sizeof(int); + smemHashHistKernel<<>>( + bins, data, nrows, nbins, binner, hashSize, flushThreshold); +} + +template +void histogramVecLen(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner) +{ + RAFT_CUDA_TRY(cudaMemsetAsync(bins, 0, ncols * nbins * sizeof(int), stream)); + switch (type) { + case HistTypeGmem: + gmemHist(bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmem: + smemHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemMatchAny: + smemHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits16: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits8: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits4: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits2: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits1: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemHash: + smemHashHist(bins, nbins, data, nrows, ncols, binner, stream); + break; + default: ASSERT(false, "histogram: Invalid type passed '%d'!", type); + }; + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void histogramImpl(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner) +{ + size_t bytes = nrows * sizeof(DataT); + if (nrows <= 0) return; + if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else if (4 % sizeof(DataT) == 0 && bytes % 4 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else if (2 % sizeof(DataT) == 0 && bytes % 2 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } +} + +template +HistType selectBestHistAlgo(IdxT nbins) +{ + size_t smem = raft::getSharedMemPerBlock(); + size_t requiredSize = nbins * sizeof(unsigned); + if (requiredSize <= smem) { return HistTypeSmem; } + for (int bits = 16; bits >= 1; bits >>= 1) { + auto nBytesForBins = raft::ceildiv(bits * nbins, 8); + requiredSize = raft::alignTo(nBytesForBins, sizeof(unsigned)); + if (requiredSize <= smem) { return static_cast(bits); } + } + return HistTypeGmem; +} + +/** + * @brief Perform histogram on the input data. It chooses the right load size + * based on the input data vector length. It also supports large-bin cases + * using a specialized smem-based hashing technique. + * @tparam DataT input data type + * @tparam IdxT data type used to compute indices + * @tparam BinnerOp takes the input data and computes its bin index + * @param type histogram implementation type to choose + * @param bins the output bins (length = ncols * nbins) + * @param nbins number of bins + * @param data input data (length = ncols * nrows) + * @param nrows data array length in each column (or batch) + * @param ncols number of columsn (or batch size) + * @param stream cuda stream + * @param binner the operation that computes the bin index of the input data + * + * @note signature of BinnerOp is `int func(DataT, IdxT);` + */ +template > +void histogram(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner = IdentityBinner()) +{ + HistType computedType = type; + if (type == HistTypeAuto) { computedType = selectBestHistAlgo(nbins); } + histogramImpl( + computedType, bins, nbins, data, nrows, ncols, stream, binner); +} + +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/minmax.cuh b/cpp/include/raft/stats/detail/minmax.cuh new file mode 100644 index 0000000000..2a4a9bff93 --- /dev/null +++ b/cpp/include/raft/stats/detail/minmax.cuh @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace raft { +namespace stats { +namespace detail { + +// TODO: replace with `std::bitcast` once we adopt C++20 or libcu++ adds it +template +constexpr To bit_cast(const From& from) noexcept +{ + To to{}; + static_assert(sizeof(To) == sizeof(From)); + memcpy(&to, &from, sizeof(To)); + return to; +} + +template +struct encode_traits { +}; + +template <> +struct encode_traits { + using E = int; +}; + +template <> +struct encode_traits { + using E = long long; +}; + +HDI int encode(float val) +{ + int i = detail::bit_cast(val); + return i >= 0 ? i : (1 << 31) | ~i; +} + +HDI long long encode(double val) +{ + std::int64_t i = detail::bit_cast(val); + return i >= 0 ? i : (1ULL << 63) | ~i; +} + +HDI float decode(int val) +{ + if (val < 0) val = (1 << 31) | ~val; + return detail::bit_cast(val); +} + +HDI double decode(long long val) +{ + if (val < 0) val = (1ULL << 63) | ~val; + return detail::bit_cast(val); +} + +template +DI T atomicMaxBits(T* address, T val) +{ + E old = atomicMax((E*)address, encode(val)); + return decode(old); +} + +template +DI T atomicMinBits(T* address, T val) +{ + E old = atomicMin((E*)address, encode(val)); + return decode(old); +} + +template +__global__ void decodeKernel(T* globalmin, T* globalmax, int ncols) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < ncols) { + globalmin[tid] = decode(*(E*)&globalmin[tid]); + globalmax[tid] = decode(*(E*)&globalmax[tid]); + } +} + +///@todo: implement a proper "fill" kernel +template +__global__ void minmaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= ncols) return; + *(E*)&globalmin[tid] = encode(init_val); + *(E*)&globalmax[tid] = encode(-init_val); +} + +template +__global__ void minmaxKernel(const T* data, + const unsigned int* rowids, + const unsigned int* colids, + int nrows, + int ncols, + int row_stride, + T* g_min, + T* g_max, + T* sampledcols, + T init_min_val, + int batch_ncols, + int num_batches) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + T* s_min = (T*)shmem; + T* s_max = (T*)(shmem + sizeof(T) * batch_ncols); + + int last_batch_ncols = ncols % batch_ncols; + if (last_batch_ncols == 0) { last_batch_ncols = batch_ncols; } + int orig_batch_ncols = batch_ncols; + + for (int batch_id = 0; batch_id < num_batches; batch_id++) { + if (batch_id == num_batches - 1) { batch_ncols = last_batch_ncols; } + + for (int i = threadIdx.x; i < batch_ncols; i += blockDim.x) { + *(E*)&s_min[i] = encode(init_min_val); + *(E*)&s_max[i] = encode(-init_min_val); + } + __syncthreads(); + + for (int i = tid; i < nrows * batch_ncols; i += blockDim.x * gridDim.x) { + int col = (batch_id * orig_batch_ncols) + (i / nrows); + int row = i % nrows; + if (colids != nullptr) { col = colids[col]; } + if (rowids != nullptr) { row = rowids[row]; } + int index = row + col * row_stride; + T coldata = data[index]; + if (!isnan(coldata)) { + // Min max values are saved in shared memory and global memory as per the shuffled colids. + atomicMinBits(&s_min[(int)(i / nrows)], coldata); + atomicMaxBits(&s_max[(int)(i / nrows)], coldata); + } + if (sampledcols != nullptr) { sampledcols[batch_id * orig_batch_ncols + i] = coldata; } + } + __syncthreads(); + + // finally, perform global mem atomics + for (int j = threadIdx.x; j < batch_ncols; j += blockDim.x) { + atomicMinBits(&g_min[batch_id * orig_batch_ncols + j], decode(*(E*)&s_min[j])); + atomicMaxBits(&g_max[batch_id * orig_batch_ncols + j], decode(*(E*)&s_max[j])); + } + __syncthreads(); + } +} + +/** + * @brief Computes min/max across every column of the input matrix, as well as + * optionally allow to subsample based on the given row/col ID mapping vectors + * + * @tparam T the data type + * @tparam TPB number of threads per block + * @param data input data + * @param rowids actual row ID mappings. It is of length nrows. If you want to + * skip this index lookup entirely, pass nullptr + * @param colids actual col ID mappings. It is of length ncols. If you want to + * skip this index lookup entirely, pass nullptr + * @param nrows number of rows of data to be worked upon. The actual rows of the + * input "data" can be bigger than this! + * @param ncols number of cols of data to be worked upon. The actual cols of the + * input "data" can be bigger than this! + * @param row_stride stride (in number of elements) between 2 adjacent columns + * @param globalmin final col-wise global minimum (size = ncols) + * @param globalmax final col-wise global maximum (size = ncols) + * @param sampledcols output sampled data. Pass nullptr if you don't need this + * @param stream cuda stream + * @note This method makes the following assumptions: + * 1. input and output matrices are assumed to be col-major + * 2. ncols is small enough to fit the whole of min/max values across all cols + * in shared memory + */ +template +void minmax(const T* data, + const unsigned* rowids, + const unsigned* colids, + int nrows, + int ncols, + int row_stride, + T* globalmin, + T* globalmax, + T* sampledcols, + cudaStream_t stream) +{ + using E = typename encode_traits::E; + int nblks = raft::ceildiv(ncols, TPB); + T init_val = std::numeric_limits::max(); + minmaxInitKernel<<>>(ncols, globalmin, globalmax, init_val); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + nblks = raft::ceildiv(nrows * ncols, TPB); + nblks = min(nblks, 65536); + size_t smemSize = sizeof(T) * 2 * ncols; + + // Compute the batch_ncols, in [1, ncols] range, that meet the available + // shared memory constraints. + auto smemPerBlk = raft::getSharedMemPerBlock(); + int batch_ncols = min(ncols, (int)(smemPerBlk / (sizeof(T) * 2))); + int num_batches = raft::ceildiv(ncols, batch_ncols); + smemSize = sizeof(T) * 2 * batch_ncols; + + minmaxKernel<<>>(data, + rowids, + colids, + nrows, + ncols, + row_stride, + globalmin, + globalmax, + sampledcols, + init_val, + batch_ncols, + num_batches); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + decodeKernel<<>>(globalmin, globalmax, ncols); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh new file mode 100644 index 0000000000..ca7fc136d3 --- /dev/null +++ b/cpp/include/raft/stats/detail/weighted_mean.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace stats { +namespace detail { + +/** + * @brief Compute the row-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be row-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void rowWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + // sum the weights & copy back to CPU + Type WS = 0; + raft::linalg::coalescedReduction(mu, weights, D, 1, (Type)0, stream, false); + raft::update_host(&WS, mu, 1, stream); + + raft::linalg::coalescedReduction( + mu, + data, + D, + N, + (Type)0, + stream, + false, + [weights] __device__(Type v, int i) { return v * weights[i]; }, + [] __device__(Type a, Type b) { return a + b; }, + [WS] __device__(Type v) { return v / WS; }); +} + +/** + * @brief Compute the column-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be column-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void colWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + // sum the weights & copy back to CPU + Type WS = 0; + raft::linalg::stridedReduction(mu, weights, 1, N, (Type)0, stream, false); + raft::update_host(&WS, mu, 1, stream); + + raft::linalg::stridedReduction( + mu, + data, + D, + N, + (Type)0, + stream, + false, + [weights] __device__(Type v, int i) { return v * weights[i]; }, + [] __device__(Type a, Type b) { return a + b; }, + [WS] __device__(Type v) { return v / WS; }); +} +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp new file mode 100644 index 0000000000..d4d3b449f7 --- /dev/null +++ b/cpp/include/raft/stats/histogram.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +// This file is a shameless amalgamation of independent works done by +// Lars Nyland and Andy Adinets + +///@todo: add cub's histogram as another option + +namespace raft { +namespace stats { + +/** + * @brief Perform histogram on the input data. It chooses the right load size + * based on the input data vector length. It also supports large-bin cases + * using a specialized smem-based hashing technique. + * @tparam DataT input data type + * @tparam IdxT data type used to compute indices + * @tparam BinnerOp takes the input data and computes its bin index + * @param type histogram implementation type to choose + * @param bins the output bins (length = ncols * nbins) + * @param nbins number of bins + * @param data input data (length = ncols * nrows) + * @param nrows data array length in each column (or batch) + * @param ncols number of columsn (or batch size) + * @param stream cuda stream + * @param binner the operation that computes the bin index of the input data + * + * @note signature of BinnerOp is `int func(DataT, IdxT);` + */ +template > +void histogram(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner = IdentityBinner()) +{ + detail::histogram(type, bins, nbins, data, nrows, ncols, stream, binner); +} + +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp new file mode 100644 index 0000000000..966287bb41 --- /dev/null +++ b/cpp/include/raft/stats/minmax.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace raft { +namespace stats { + +/** + * @brief Computes min/max across every column of the input matrix, as well as + * optionally allow to subsample based on the given row/col ID mapping vectors + * + * @tparam T the data type + * @tparam TPB number of threads per block + * @param data input data + * @param rowids actual row ID mappings. It is of length nrows. If you want to + * skip this index lookup entirely, pass nullptr + * @param colids actual col ID mappings. It is of length ncols. If you want to + * skip this index lookup entirely, pass nullptr + * @param nrows number of rows of data to be worked upon. The actual rows of the + * input "data" can be bigger than this! + * @param ncols number of cols of data to be worked upon. The actual cols of the + * input "data" can be bigger than this! + * @param row_stride stride (in number of elements) between 2 adjacent columns + * @param globalmin final col-wise global minimum (size = ncols) + * @param globalmax final col-wise global maximum (size = ncols) + * @param sampledcols output sampled data. Pass nullptr if you don't need this + * @param stream cuda stream + * @note This method makes the following assumptions: + * 1. input and output matrices are assumed to be col-major + * 2. ncols is small enough to fit the whole of min/max values across all cols + * in shared memory + */ +template +void minmax(const T* data, + const unsigned* rowids, + const unsigned* colids, + int nrows, + int ncols, + int row_stride, + T* globalmin, + T* globalmax, + T* sampledcols, + cudaStream_t stream) +{ + detail::minmax( + data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp new file mode 100644 index 0000000000..ad90142a08 --- /dev/null +++ b/cpp/include/raft/stats/weighted_mean.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute the row-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be row-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void rowWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + detail::rowWeightedMean(mu, data, weights, D, N, stream); +} + +/** + * @brief Compute the column-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be column-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void colWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + detail::colWeightedMean(mu, data, weights, D, N, stream); +} +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 654ab73f84..430b69341c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -16,6 +16,7 @@ # keep the files in alphabetical order! add_executable(test_raft + test/common/seive.cu test/cudart_utils.cpp test/cluster_solvers.cu test/distance/dist_adj.cu @@ -106,14 +107,19 @@ add_executable(test_raft test/spatial/fused_l2_knn.cu test/spatial/haversine.cu test/spatial/ball_cover.cu + test/spatial/epsilon_neighborhood.cu test/spatial/faiss_mr.cu test/spatial/selection.cu test/spectral_matrix.cu + test/stats/cov.cu + test/stats/histogram.cu test/stats/mean.cu test/stats/meanvar.cu test/stats/mean_center.cu + test/stats/minmax.cu test/stats/stddev.cu test/stats/sum.cu + test/stats/weighted_mean.cu test/test.cpp ) diff --git a/cpp/test/common/seive.cu b/cpp/test/common/seive.cu new file mode 100644 index 0000000000..8044dbb532 --- /dev/null +++ b/cpp/test/common/seive.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft { +namespace common { +TEST(Seive, Test) +{ + Seive s1(32); + ASSERT_TRUE(s1.isPrime(17)); + ASSERT_FALSE(s1.isPrime(28)); + + Seive s2(1024 * 1024); + ASSERT_TRUE(s2.isPrime(107)); + ASSERT_FALSE(s2.isPrime(111)); + ASSERT_TRUE(s2.isPrime(6047)); +} + +} // end namespace common +} // end namespace raft diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu index b8e44580b5..da38464bf7 100644 --- a/cpp/test/linalg/rsvd.cu +++ b/cpp/test/linalg/rsvd.cu @@ -31,6 +31,7 @@ struct RsvdInputs { T tolerance; int n_row; int n_col; + float redundancy; T PC_perc; T UpS_perc; int k; @@ -66,7 +67,7 @@ class RsvdTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); // rSVD seems to be very sensitive to the random number sequence as well! - raft::random::Rng r(params.seed, raft::random::GenTaps); + raft::random::Rng r(params.seed, raft::random::GenPC); int m = params.n_row, n = params.n_col; T eig_svd_tol = 1.e-7; int max_sweeps = 100; @@ -91,8 +92,19 @@ class RsvdTest : public ::testing::TestWithParam> { raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream); raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream); - } else { // Other normal tests - r.normal(A.data(), m * n, mu, sigma, stream); + } else { // Other normal tests + int n_informative = int(0.25f * n); // Informative cols + int len_informative = m * n_informative; + + int n_redundant = n - n_informative; // Redundant cols + int len_redundant = m * n_redundant; + + r.normal(A.data(), len_informative, mu, sigma, stream); + CUDA_CHECK(cudaMemcpyAsync(A.data() + len_informative, + A.data(), + len_redundant * sizeof(T), + cudaMemcpyDeviceToDevice, + stream)); } std::vector A_backup_cpu(m * n); // Backup A matrix as svdJacobi will destroy the content of A @@ -157,59 +169,65 @@ class RsvdTest : public ::testing::TestWithParam> { const std::vector> inputs_fx = { // Test with ratios - {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT - {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Tall + non-BBT - {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT - {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL} // Tall + non-BBT - - , // Test with fixed ranks - {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT - {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT - {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT - {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Tall + non-BBT - {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT - {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT - {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT - {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL} // Tall + non-BBT + {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + + {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Tall + non-BBT + + {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + + {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + + {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Tall + non-BBT + + {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + + {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL} // Tall + non-BBT }; const std::vector> inputs_dx = { // Test with ratios - {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT - {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Tall + non-BBT - {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT - {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL} // Tall + non-BBT - - , // Test with fixed ranks - {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT - {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT - {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT - {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Tall + non-BBT - {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT - {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT - {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT - {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL} // Tall + non-BBT + {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, // Tall + non-BBT + {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {0.12, 2048, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12, 2048, 256, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL}, // Tall + non-BBT + {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL} // Tall + non-BBT }; const std::vector> sanity_inputs_fx = { - {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}}; + {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, false, 4321ULL}}; const std::vector> sanity_inputs_dx = { - {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL}, - {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL}, - {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL}, - {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}}; + {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, + {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, false, 4321ULL}}; typedef RsvdTest RsvdSanityCheckValF; TEST_P(RsvdSanityCheckValF, Result) diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu index 8c7e440d0e..caad627d49 100644 --- a/cpp/test/random/make_blobs.cu +++ b/cpp/test/random/make_blobs.cu @@ -21,7 +21,8 @@ #include #include -namespace raft::random { +namespace raft { +namespace random { template __global__ void meanKernel(T* out, @@ -136,8 +137,8 @@ class MakeBlobsTest : public ::testing::TestWithParam> { { int len = params.n_clusters * params.cols; auto compare = raft::CompareApprox(num_sigma * params.tolerance); - ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare, stream)); - ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare, stream)); + ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare)); + ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare)); } protected: @@ -153,53 +154,37 @@ typedef MakeBlobsTest MakeBlobsTestF; const std::vector> inputsf_t = { {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, - + {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + + {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPC, 1234ULL}, }; TEST_P(MakeBlobsTestF, Result) { check(); } @@ -209,55 +194,40 @@ typedef MakeBlobsTest MakeBlobsTestD; const std::vector> inputsd_t = { {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPC, 1234ULL}, }; TEST_P(MakeBlobsTestD, Result) { check(); } INSTANTIATE_TEST_CASE_P(MakeBlobsTests, MakeBlobsTestD, ::testing::ValuesIn(inputsd_t)); -} // end namespace raft::random \ No newline at end of file +} // end namespace random +} // end namespace raft \ No newline at end of file diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu new file mode 100644 index 0000000000..33af5726a0 --- /dev/null +++ b/cpp/test/spatial/epsilon_neighborhood.cu @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace spatial { +namespace knn { +template +struct EpsInputs { + IdxT n_row, n_col, n_centers, n_batches; + T eps; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const EpsInputs& p) +{ + return os; +} + +template +class EpsNeighTest : public ::testing::TestWithParam> { + protected: + EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {} + + void SetUp() override + { + param = ::testing::TestWithParam>::GetParam(); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + data.resize(param.n_row * param.n_col, stream); + labels.resize(param.n_row, stream); + batchSize = param.n_row / param.n_batches; + adj.resize(param.n_row * batchSize, stream); + vd.resize(batchSize + 1, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream)); + random::make_blobs(data.data(), + labels.data(), + param.n_row, + param.n_col, + param.n_centers, + stream, + true, + nullptr, + nullptr, + T(0.01), + false); + } + + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } + + EpsInputs param; + cudaStream_t stream = 0; + rmm::device_uvector data; + rmm::device_uvector adj; + rmm::device_uvector labels, vd; + IdxT batchSize; +}; // class EpsNeighTest + +const std::vector> inputsfi = { + {15000, 16, 5, 1, 2.f}, + {14000, 16, 5, 1, 2.f}, + {15000, 17, 5, 1, 2.f}, + {14000, 17, 5, 1, 2.f}, + {15000, 18, 5, 1, 2.f}, + {14000, 18, 5, 1, 2.f}, + {15000, 32, 5, 1, 2.f}, + {14000, 32, 5, 1, 2.f}, + {20000, 10000, 10, 1, 2.f}, + {20000, 10000, 10, 2, 2.f}, +}; +typedef EpsNeighTest EpsNeighTestFI; +TEST_P(EpsNeighTestFI, Result) +{ + for (int i = 0; i < param.n_batches; ++i) { + RAFT_CUDA_TRY(cudaMemsetAsync(adj.data(), 0, sizeof(bool) * param.n_row * batchSize, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, sizeof(int) * (batchSize + 1), stream)); + epsUnexpL2SqNeighborhood(adj. + + data(), + vd + + . + + data(), + data + + . + + data(), + data + + . + + data() + + + (i * batchSize * param.n_col), + param.n_row, + batchSize, + param.n_col, + param.eps * param.eps, + stream); + ASSERT_TRUE(raft::devArrMatch( + param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare(), stream)); + } +} +INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi)); + +}; // namespace knn +}; // namespace spatial +}; // namespace raft diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu new file mode 100644 index 0000000000..2db64a7999 --- /dev/null +++ b/cpp/test/stats/cov.cu @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +template +struct CovInputs { + T tolerance, mean, var; + int rows, cols; + bool sample, rowMajor, stable; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const CovInputs& dims) +{ + return os; +} + +template +class CovTest : public ::testing::TestWithParam> { + protected: + CovTest() + : data(0, stream), + mean_act(0, stream), + cov_act(0, stream), + cov_cm(0, stream), + cov_cm_ref(0, stream) + { + } + + void SetUp() override + { + raft::handle_t handle; + cudaStream_t stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + params.tolerance *= 2; + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + auto len = rows * cols; + T var = params.var; + data.resize(len, stream); + mean_act.resize(cols, stream); + cov_act.resize(cols * cols, stream); + + r.normal(data.data(), len, params.mean, var, stream); + raft::stats::mean( + mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + cov(handle, + cov_act.data(), + data.data(), + mean_act.data(), + cols, + rows, + params.sample, + params.rowMajor, + params.stable, + stream); + + T data_h[6] = {1.0, 2.0, 5.0, 4.0, 2.0, 1.0}; + T cov_cm_ref_h[4] = {4.3333, -2.8333, -2.8333, 2.333}; + + cov_cm.resize(4, stream); + cov_cm_ref.resize(4, stream); + rmm::device_uvector data_cm(6, stream); + rmm::device_uvector mean_cm(2, stream); + + raft::update_device(data_cm.data(), data_h, 6, stream); + raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream); + + raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream); + cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream); + } + + protected: + CovInputs params; + rmm::device_uvector data, mean_act, cov_act, cov_cm, cov_cm_ref; + cublasHandle_t handle; + cudaStream_t stream = 0; +}; + +///@todo: add stable=false after it has been implemented +const std::vector> inputsf = { + {0.03f, 1.f, 2.f, 32 * 1024, 32, true, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 64, true, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 128, true, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 256, true, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 32, false, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 64, false, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 128, false, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 256, false, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 32, true, true, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 64, true, true, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 128, true, true, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 256, true, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 32, false, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 64, false, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 128, false, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 256, false, true, true, 1234ULL}}; + +const std::vector> inputsd = { + {0.03, 1.0, 2.0, 32 * 1024, 32, true, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 64, true, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 128, true, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 256, true, false, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 32, false, false, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 64, false, false, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 128, false, false, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 256, false, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 32, true, true, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 64, true, true, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 128, true, true, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 256, true, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 32, false, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 64, false, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 128, false, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 256, false, true, true, 1234ULL}}; + +typedef CovTest CovTestF; +TEST_P(CovTestF, Result) +{ + ASSERT_TRUE(raft::diagonalMatch(params.var * params.var, + cov_act.data(), + params.cols, + params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef CovTest CovTestD; +TEST_P(CovTestD, Result) +{ + ASSERT_TRUE(raft::diagonalMatch(params.var * params.var, + cov_act.data(), + params.cols, + params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef CovTest CovTestSmallF; +TEST_P(CovTestSmallF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox(params.tolerance))); +} + +typedef CovTest CovTestSmallD; +TEST_P(CovTestSmallD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestD, ::testing::ValuesIn(inputsd)); + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallD, ::testing::ValuesIn(inputsd)); + +} // namespace stats +} // namespace raft diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu new file mode 100644 index 0000000000..ff538fcdca --- /dev/null +++ b/cpp/test/stats/histogram.cu @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +// Note: this kernel also updates the input vector to take care of OOB bins! +__global__ void naiveHistKernel(int* bins, int nbins, int* in, int nrows) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + auto offset = blockIdx.y * nrows; + auto binOffset = blockIdx.y * nbins; + for (; tid < nrows; tid += stride) { + int id = in[offset + tid]; + if (id < 0) + id = 0; + else if (id >= nbins) + id = nbins - 1; + in[offset + tid] = id; + raft::myAtomicAdd(bins + binOffset + id, 1); + } +} + +void naiveHist(int* bins, int nbins, int* in, int nrows, int ncols, cudaStream_t stream) +{ + const int TPB = 128; + int nblksx = raft::ceildiv(nrows, TPB); + dim3 blks(nblksx, ncols); + naiveHistKernel<<>>(bins, nbins, in, nrows); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +struct HistInputs { + int nrows, ncols, nbins; + bool isNormal; + HistType type; + int start, end; + unsigned long long int seed; +}; + +class HistTest : public ::testing::TestWithParam { + protected: + HistTest() : in(0, stream), bins(0, stream), ref_bins(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam::GetParam(); + raft::random::Rng r(params.seed); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + int len = params.nrows * params.ncols; + in.resize(len, stream); + if (params.isNormal) { + r.normalInt(in.data(), len, params.start, params.end, stream); + } else { + r.uniformInt(in.data(), len, params.start, params.end, stream); + } + bins.resize(params.nbins * params.ncols, stream); + ref_bins.resize(params.nbins * params.ncols, stream); + RAFT_CUDA_TRY( + cudaMemsetAsync(ref_bins.data(), 0, sizeof(int) * params.nbins * params.ncols, stream)); + naiveHist(ref_bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream); + histogram( + params.type, bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream); + raft::interruptible::synchronize(stream); + } + + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } + + protected: + cudaStream_t stream = 0; + HistInputs params; + rmm::device_uvector in, bins, ref_bins; +}; + +static const int oneK = 1024; +static const int oneM = oneK * oneK; +const std::vector inputs = { + {oneM, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, +}; +TEST_P(HistTest, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + ref_bins.data(), bins.data(), params.nbins * params.ncols, raft::Compare())); +} +INSTANTIATE_TEST_CASE_P(HistTests, HistTest, ::testing::ValuesIn(inputs)); + +} // end namespace stats +} // end namespace raft diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu new file mode 100644 index 0000000000..61b16b65ae --- /dev/null +++ b/cpp/test/stats/minmax.cu @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +///@todo: need to add tests for verifying the column subsampling feature + +template +struct MinMaxInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const MinMaxInputs& dims) +{ + return os; +} + +template +__global__ void naiveMinMaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= ncols) return; + globalmin[tid] = init_val; + globalmax[tid] = -init_val; +} + +template +__global__ void naiveMinMaxKernel(const T* data, int nrows, int ncols, T* globalmin, T* globalmax) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int col = tid / nrows; + if (col < ncols) { + T val = data[tid]; + if (!isnan(val)) { + raft::myAtomicMin(&globalmin[col], val); + raft::myAtomicMax(&globalmax[col], val); + } + } +} + +template +void naiveMinMax( + const T* data, int nrows, int ncols, T* globalmin, T* globalmax, cudaStream_t stream) +{ + const int TPB = 128; + int nblks = raft::ceildiv(ncols, TPB); + T init_val = std::numeric_limits::max(); + naiveMinMaxInitKernel<<>>(ncols, globalmin, globalmax, init_val); + RAFT_CUDA_TRY(cudaGetLastError()); + nblks = raft::ceildiv(nrows * ncols, TPB); + naiveMinMaxKernel<<>>(data, nrows, ncols, globalmin, globalmax); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +__global__ void nanKernel(T* data, const bool* mask, int len, T nan) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= len) return; + if (!mask[tid]) data[tid] = nan; +} + +template +class MinMaxTest : public ::testing::TestWithParam> { + protected: + MinMaxTest() : minmax_act(0, stream), minmax_ref(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.rows * params.cols; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + rmm::device_uvector data(len, stream); + rmm::device_uvector mask(len, stream); + minmax_act.resize(2 * params.cols, stream); + minmax_ref.resize(2 * params.cols, stream); + + r.normal(data.data(), len, (T)0.0, (T)1.0, stream); + T nan_prob = 0.01; + r.bernoulli(mask.data(), len, nan_prob, stream); + const int TPB = 256; + nanKernel<<>>( + data.data(), mask.data(), len, std::numeric_limits::quiet_NaN()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + naiveMinMax(data.data(), + params.rows, + params.cols, + minmax_ref.data(), + minmax_ref.data() + params.cols, + stream); + minmax(data.data(), + nullptr, + nullptr, + params.rows, + params.cols, + params.rows, + minmax_act.data(), + minmax_act.data() + params.cols, + nullptr, + stream); + } + + protected: + MinMaxInputs params; + rmm::device_uvector minmax_act; + rmm::device_uvector minmax_ref; + cudaStream_t stream = 0; +}; + +const std::vector> inputsf = {{0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}, + {0.00001f, 1024, 512, 1234ULL}, + {0.00001f, 1024, 1024, 1234ULL}, + {0.00001f, 4096, 32, 1234ULL}, + {0.00001f, 4096, 64, 1234ULL}, + {0.00001f, 4096, 128, 1234ULL}, + {0.00001f, 4096, 256, 1234ULL}, + {0.00001f, 4096, 512, 1234ULL}, + {0.00001f, 4096, 1024, 1234ULL}, + {0.00001f, 8192, 32, 1234ULL}, + {0.00001f, 8192, 64, 1234ULL}, + {0.00001f, 8192, 128, 1234ULL}, + {0.00001f, 8192, 256, 1234ULL}, + {0.00001f, 8192, 512, 1234ULL}, + {0.00001f, 8192, 1024, 1234ULL}, + {0.00001f, 1024, 8192, 1234ULL}}; + +const std::vector> inputsd = {{0.0000001, 1024, 32, 1234ULL}, + {0.0000001, 1024, 64, 1234ULL}, + {0.0000001, 1024, 128, 1234ULL}, + {0.0000001, 1024, 256, 1234ULL}, + {0.0000001, 1024, 512, 1234ULL}, + {0.0000001, 1024, 1024, 1234ULL}, + {0.0000001, 4096, 32, 1234ULL}, + {0.0000001, 4096, 64, 1234ULL}, + {0.0000001, 4096, 128, 1234ULL}, + {0.0000001, 4096, 256, 1234ULL}, + {0.0000001, 4096, 512, 1234ULL}, + {0.0000001, 4096, 1024, 1234ULL}, + {0.0000001, 8192, 32, 1234ULL}, + {0.0000001, 8192, 64, 1234ULL}, + {0.0000001, 8192, 128, 1234ULL}, + {0.0000001, 8192, 256, 1234ULL}, + {0.0000001, 8192, 512, 1234ULL}, + {0.0000001, 8192, 1024, 1234ULL}, + {0.0000001, 1024, 8192, 1234ULL}}; + +typedef MinMaxTest MinMaxTestF; +TEST_P(MinMaxTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(), + minmax_act.data(), + 2 * params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef MinMaxTest MinMaxTestD; +TEST_P(MinMaxTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(), + minmax_act.data(), + 2 * params.cols, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace stats +} // end namespace raft diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu new file mode 100644 index 0000000000..ee58747b69 --- /dev/null +++ b/cpp/test/stats/weighted_mean.cu @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +template +struct WeightedMeanInputs { + T tolerance; + int M, N; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const WeightedMeanInputs& I) +{ + return os << "{ " << I.tolerance << ", " << I.M << ", " << I.N << ", " << I.seed << "}" + << std::endl; +} + +///// weighted row-wise mean test and support functions +template +void naiveRowWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor) +{ + int istr = rowMajor ? 1 : M; + int jstr = rowMajor ? N : 1; + + // sum the weights + T WS = 0; + for (int i = 0; i < N; i++) + WS += W[i]; + + for (int j = 0; j < M; j++) { + R[j] = (T)0; + for (int i = 0; i < N; i++) { + // R[j] += (W[i]*D[i*istr + j*jstr] - R[j])/(T)(i+1); + R[j] += (W[i] * D[i * istr + j * jstr]) / WS; + } + } +} + +template +class RowWeightedMeanTest : public ::testing::TestWithParam> { + protected: + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.M, cols = params.N, len = rows * cols; + cudaStream_t stream = 0; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + // device-side data + din.resize(len); + dweights.resize(cols); + dexp.resize(rows); + dact.resize(rows); + + // create random matrix and weights + r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream); + r.uniform(dweights.data().get(), cols, T(-1.0), T(1.0), stream); + + // host-side data + thrust::host_vector hin = din; + thrust::host_vector hweights = dweights; + thrust::host_vector hexp(rows); + + // compute naive result & copy to GPU + naiveRowWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true); + dexp = hexp; + + // compute ml-prims result + rowWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream); + + // adjust tolerance to account for round-off accumulation + params.tolerance *= params.N; + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + void TearDown() override {} + + protected: + WeightedMeanInputs params; + thrust::host_vector hin, hweights; + thrust::device_vector din, dweights, dexp, dact; +}; + +///// weighted column-wise mean test and support functions +template +void naiveColWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor) +{ + int istr = rowMajor ? 1 : M; + int jstr = rowMajor ? N : 1; + + // sum the weights + T WS = 0; + for (int j = 0; j < M; j++) + WS += W[j]; + + for (int i = 0; i < N; i++) { + R[i] = (T)0; + for (int j = 0; j < M; j++) { + // R[i] += (W[j]*D[i*istr + j*jstr] - R[i])/(T)(j+1); + R[i] += (W[j] * D[i * istr + j * jstr]) / WS; + } + } +} + +template +class ColWeightedMeanTest : public ::testing::TestWithParam> { + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.M, cols = params.N, len = rows * cols; + + cudaStream_t stream = 0; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + // device-side data + din.resize(len); + dweights.resize(rows); + dexp.resize(cols); + dact.resize(cols); + + // create random matrix and weights + r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream); + r.uniform(dweights.data().get(), rows, T(-1.0), T(1.0), stream); + + // host-side data + thrust::host_vector hin = din; + thrust::host_vector hweights = dweights; + thrust::host_vector hexp(cols); + + // compute naive result & copy to GPU + naiveColWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true); + dexp = hexp; + + // compute ml-prims result + colWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream); + + // adjust tolerance to account for round-off accumulation + params.tolerance *= params.M; + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + void TearDown() override {} + + protected: + WeightedMeanInputs params; + thrust::host_vector hin, hweights; + thrust::device_vector din, dweights, dexp, dact; +}; + +////// Parameter sets and test instantiation +static const float tolF = 128 * std::numeric_limits::epsilon(); +static const double tolD = 256 * std::numeric_limits::epsilon(); + +const std::vector> inputsf = {{tolF, 4, 4, 1234}, + {tolF, 1024, 32, 1234}, + {tolF, 1024, 64, 1234}, + {tolF, 1024, 128, 1234}, + {tolF, 1024, 256, 1234}, + {tolF, 1024, 32, 1234}, + {tolF, 1024, 64, 1234}, + {tolF, 1024, 128, 1234}, + {tolF, 1024, 256, 1234}}; + +const std::vector> inputsd = {{tolD, 4, 4, 1234}, + {tolD, 1024, 32, 1234}, + {tolD, 1024, 64, 1234}, + {tolD, 1024, 128, 1234}, + {tolD, 1024, 256, 1234}, + {tolD, 1024, 32, 1234}, + {tolD, 1024, 64, 1234}, + {tolD, 1024, 128, 1234}, + {tolD, 1024, 256, 1234}}; + +using RowWeightedMeanTestF = RowWeightedMeanTest; +TEST_P(RowWeightedMeanTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestF, ::testing::ValuesIn(inputsf)); + +using RowWeightedMeanTestD = RowWeightedMeanTest; +TEST_P(RowWeightedMeanTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestD, ::testing::ValuesIn(inputsd)); + +using ColWeightedMeanTestF = ColWeightedMeanTest; +TEST_P(ColWeightedMeanTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestF, ::testing::ValuesIn(inputsf)); + +using ColWeightedMeanTestD = ColWeightedMeanTest; +TEST_P(ColWeightedMeanTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestD, ::testing::ValuesIn(inputsd)); + +}; // end namespace stats +}; // end namespace raft