diff --git a/CHANGELOG.md b/CHANGELOG.md index f7ea434693..b13ec307c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # RAFT 0.17.0 (Date TBD) ## New Features +- PR #65: Adding cuml prims that break circular dependency between cuml and cumlprims projects ## Improvements - PR #73: Move DistanceType enum from cuML to RAFT diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f3de222928..cbe96454f5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -235,8 +235,34 @@ if(BUILD_RAFT_TESTS) test/cudart_utils.cpp test/handle.cpp test/integer_utils.cpp + test/linalg/add.cu + test/linalg/binary_op.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector_op.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/reduce.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + test/matrix/math.cu + test/matrix/matrix.cu test/mr/device/buffer.cpp test/mr/host/buffer.cpp + test/random/rng.cu + test/random/rng_int.cu + test/random/sample_without_replacement.cu + test/stats/mean.cu + test/stats/mean_center.cu + test/stats/stddev.cu + test/stats/sum.cu test/test.cpp test/spectral_matrix.cu test/eigen_solvers.cu diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh new file mode 100644 index 0000000000..8d5b29f700 --- /dev/null +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { + +/** + * @brief Convenience wrapper over cub's SortPairs method + * @tparam KeyT key type + * @tparam ValueT value type + * @param workspace workspace buffer which will get resized if not enough space + * @param inKeys input keys array + * @param outKeys output keys array + * @param inVals input values array + * @param outVals output values array + * @param len array length + * @param stream cuda stream + */ +template +void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, + KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, + cudaStream_t stream) { + size_t worksize; + cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals, + outVals, len, 0, sizeof(KeyT) * 8, stream); + workspace.resize(worksize, stream); + cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys, + inVals, outVals, len, 0, sizeof(KeyT) * 8, + stream); +} + +} // namespace raft diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh new file mode 100644 index 0000000000..785794461e --- /dev/null +++ b/cpp/include/raft/common/scatter.cuh @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
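// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// raft::sortPairs (cub_wrappers.cuh above) sizes the temporary storage with a
// first cub call, grows the caller-provided workspace if needed, and then
// sorts. Assuming a device allocator `alloc`, a stream, and device arrays of
// `n` float keys and int values (all names here are hypothetical):
//
//   raft::mr::device::buffer<char> workspace(alloc, stream, 0);
//   raft::sortPairs(workspace, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
//                   n, stream);
//
// The same workspace can be reused across calls; it only grows when a sort
// needs more temporary storage than any previous one.
// ----------------------------------------------------------------------------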
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { + +template +__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, + IdxT len, Lambda op) { + typedef TxN_t DataVec; + typedef TxN_t IdxVec; + IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); + tid *= VecLen; + if (tid >= len) return; + IdxVec idxIn; + idxIn.load(idx, tid); + DataVec dataIn; +#pragma unroll + for (int i = 0; i < VecLen; ++i) { + auto inPos = idxIn.val.data[i]; + dataIn.val.data[i] = op(in[inPos], tid + i); + } + dataIn.store(out, tid); +} + +template +void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len, + Lambda op, cudaStream_t stream) { + const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); + scatterKernel + <<>>(out, in, idx, len, op); + CUDA_CHECK(cudaGetLastError()); +} + +/** + * @brief Performs scatter operation based on the input indexing array + * @tparam DataT data type whose array gets scattered + * @tparam IdxT indexing type + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Lambda the device-lambda performing a unary operation on the loaded + * data before it gets scattered + * @param out the output array + * @param in the input array + * @param idx the indexing array + * @param len number of elements in the input array + * @param stream cuda stream where to launch work + * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This + * will be applied to every element before scattering it to the right location. + * The second param in this method will be the destination index. + */ +template , int TPB = 256> +void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len, + cudaStream_t stream, Lambda op = raft::Nop()) { + if (len <= 0) return; + constexpr size_t DataSize = sizeof(DataT); + constexpr size_t IdxSize = sizeof(IdxT); + constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; + size_t bytes = len * MaxPerElem; + if (16 / MaxPerElem && bytes % 16 == 0) { + scatterImpl(out, in, idx, len, + op, stream); + } else if (8 / MaxPerElem && bytes % 8 == 0) { + scatterImpl(out, in, idx, len, op, + stream); + } else if (4 / MaxPerElem && bytes % 4 == 0) { + scatterImpl(out, in, idx, len, op, + stream); + } else if (2 / MaxPerElem && bytes % 2 == 0) { + scatterImpl(out, in, idx, len, op, + stream); + } else if (1 / MaxPerElem) { + scatterImpl(out, in, idx, len, op, + stream); + } else { + scatterImpl(out, in, idx, len, op, stream); + } +} + +} // namespace raft diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh new file mode 100644 index 0000000000..696b3ec662 --- /dev/null +++ b/cpp/include/raft/cuda_utils.cuh @@ -0,0 +1,650 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
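// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// raft::scatter (scatter.cuh above) computes out[i] = op(in[idx[i]], i), with
// the vector width chosen from the buffer size. A plain permutation, and a
// variant that transforms while scattering (hypothetical device pointers):
//
//   raft::scatter<float, int>(d_out, d_in, d_idx, n, stream);
//   raft::scatter<float, int>(
//     d_out, d_in, d_idx, n, stream,
//     [] __device__(float v, int i) { return 2.0f * v; });
// ----------------------------------------------------------------------------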
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#ifndef ENABLE_MEMCPY_ASYNC +// enable memcpy_async interface by default for newer GPUs +#if __CUDA_ARCH__ >= 800 +#define ENABLE_MEMCPY_ASYNC 1 +#endif +#else // ENABLE_MEMCPY_ASYNC +// disable memcpy_async for all older GPUs +#if __CUDA_ARCH__ < 800 +#define ENABLE_MEMCPY_ASYNC 0 +#endif +#endif // ENABLE_MEMCPY_ASYNC + +namespace raft { + +/** helper macro for device inlined functions */ +#define DI inline __device__ +#define HDI inline __host__ __device__ +#define HD __host__ __device__ + +/** + * @brief Provide a ceiling division operation ie. ceil(a / b) + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType ceildiv(IntType a, IntType b) { + return (a + b - 1) / b; +} + +/** + * @brief Provide an alignment function ie. ceil(a / b) * b + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType alignTo(IntType a, IntType b) { + return ceildiv(a, b) * b; +} + +/** + * @brief Provide an alignment function ie. (a / b) * b + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType alignDown(IntType a, IntType b) { + return (a / b) * b; +} + +/** + * @brief Check if the input is a power of 2 + * @tparam IntType data type (checked only for integers) + */ +template +constexpr HDI bool isPo2(IntType num) { + return (num && !(num & (num - 1))); +} + +/** + * @brief Give logarithm of the number to base-2 + * @tparam IntType data type (checked only for integers) + */ +template +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) { + return num <= IntType(1) ? 
ret : log2(num >> IntType(1), ++ret); +} + +/** Device function to apply the input lambda across threads in the grid */ +template +DI void forEach(int num, L lambda) { + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; + const int numThreads = blockDim.x * gridDim.x; +#pragma unroll + for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { + if (idx < num) lambda(idx, itr); + } +} + +/** number of threads per warp */ +static const int WarpSize = 32; + +/** get the laneId of the current thread */ +DI int laneId() { + int id; + asm("mov.s32 %0, %laneid;" : "=r"(id)); + return id; +} + +/** + * @brief Swap two values + * @tparam T the datatype of the values + * @param a first input + * @param b second input + */ +template +HDI void swapVals(T &a, T &b) { + T tmp = a; + a = b; + b = tmp; +} + +/** Device function to have atomic add support for older archs */ +template +DI void myAtomicAdd(Type *address, Type val) { + atomicAdd(address, val); +} + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) +// Ref: +// http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf +template <> +DI void myAtomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); +} +#endif + +template +DI void myAtomicReduce(T *address, T val, ReduceLambda op); + +template +DI void myAtomicReduce(double *address, double val, ReduceLambda op) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = + atomicCAS(address_as_ull, assumed, + __double_as_longlong(op(val, __longlong_as_double(assumed)))); + } while (assumed != old); +} + +template +DI void myAtomicReduce(float *address, float val, ReduceLambda op) { + unsigned int *address_as_uint = (unsigned int *)address; + unsigned int old = *address_as_uint, assumed; + do { + assumed = old; + old = atomicCAS(address_as_uint, assumed, + __float_as_uint(op(val, __uint_as_float(assumed)))); + } while (assumed != old); +} + +template +DI void myAtomicReduce(int *address, int val, ReduceLambda op) { + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, op(val, assumed)); + } while (assumed != old); +} + +template +DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) { + long long old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, op(val, assumed)); + } while (assumed != old); +} + +template +DI void myAtomicReduce(unsigned long long *address, unsigned long long val, + ReduceLambda op) { + unsigned long long old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, op(val, assumed)); + } while (assumed != old); +} + +/** + * @brief Provide atomic min operation. + * @tparam T: data type for input data (float or double). + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMin(T *address, T val); + +/** + * @brief Provide atomic max operation. + * @tparam T: data type for input data (float or double). 
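// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// myAtomicReduce generalizes atomic read-modify-write to any binary op via an
// atomicCAS loop. For example, a grid-wide maximum (hypothetical kernel; seed
// *d_max with -INFINITY and launch raft::ceildiv(n, 256) blocks of 256):
//
//   __global__ void max_kernel(float *d_max, const float *x, int n) {
//     int i = threadIdx.x + blockIdx.x * blockDim.x;
//     if (i < n) raft::myAtomicReduce(d_max, x[i], fmaxf);
//   }
// ----------------------------------------------------------------------------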
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMax(T *address, T val); + +DI float myAtomicMin(float *address, float val) { + myAtomicReduce(address, val, fminf); + return *address; +} + +DI float myAtomicMax(float *address, float val) { + myAtomicReduce(address, val, fmaxf); + return *address; +} + +DI double myAtomicMin(double *address, double val) { + myAtomicReduce(address, val, fmin); + return *address; +} + +DI double myAtomicMax(double *address, double val) { + myAtomicReduce(address, val, fmax); + return *address; +} + +/** + * @defgroup Max maximum of two numbers + * @{ + */ +template +HDI T myMax(T x, T y); +template <> +HDI float myMax(float x, float y) { + return fmaxf(x, y); +} +template <> +HDI double myMax(double x, double y) { + return fmax(x, y); +} +/** @} */ + +/** + * @defgroup Min minimum of two numbers + * @{ + */ +template +HDI T myMin(T x, T y); +template <> +HDI float myMin(float x, float y) { + return fminf(x, y); +} +template <> +HDI double myMin(double x, double y) { + return fmin(x, y); +} +/** @} */ + +/** + * @brief Provide atomic min operation. + * @tparam T: data type for input data (float or double). + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMin(T *address, T val) { + myAtomicReduce(address, val, myMin); + return *address; +} + +/** + * @brief Provide atomic max operation. + * @tparam T: data type for input data (float or double). + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMax(T *address, T val) { + myAtomicReduce(address, val, myMax); + return *address; +} + +/** + * Sign function + */ +template +HDI int sgn(const T val) { + return (T(0) < val) - (val < T(0)); +} + +/** + * @defgroup Exp Exponential function + * @{ + */ +template +HDI T myExp(T x); +template <> +HDI float myExp(float x) { + return expf(x); +} +template <> +HDI double myExp(double x) { + return exp(x); +} +/** @} */ + +/** + * @defgroup Cuda infinity values + * @{ + */ +template +inline __device__ T myInf(); +template <> +inline __device__ float myInf() { + return CUDART_INF_F; +} +template <> +inline __device__ double myInf() { + return CUDART_INF; +} +/** @} */ + +/** + * @defgroup Log Natural logarithm + * @{ + */ +template +HDI T myLog(T x); +template <> +HDI float myLog(float x) { + return logf(x); +} +template <> +HDI double myLog(double x) { + return log(x); +} +/** @} */ + +/** + * @defgroup Sqrt Square root + * @{ + */ +template +HDI T mySqrt(T x); +template <> +HDI float mySqrt(float x) { + return sqrtf(x); +} +template <> +HDI double mySqrt(double x) { + return sqrt(x); +} +/** @} */ + +/** + * @defgroup SineCosine Sine and cosine calculation + * @{ + */ +template +DI void mySinCos(T x, T &s, T &c); +template <> +DI void mySinCos(float x, float &s, float &c) { + sincosf(x, &s, &c); +} +template <> +DI void mySinCos(double x, double &s, double &c) { + sincos(x, &s, &c); +} +/** @} */ + +/** + * @defgroup Sine Sine calculation + * @{ + */ +template +DI T mySin(T x); +template <> +DI float mySin(float x) { + return sinf(x); +} +template <> +DI double mySin(double x) { + return sin(x); +} +/** @} */ + +/** + * @defgroup Abs Absolute value + * @{ + */ +template +DI T myAbs(T 
x) { + return x < 0 ? -x : x; +} +template <> +DI float myAbs(float x) { + return fabsf(x); +} +template <> +DI double myAbs(double x) { + return fabs(x); +} +/** @} */ + +/** + * @defgroup Pow Power function + * @{ + */ +template +HDI T myPow(T x, T power); +template <> +HDI float myPow(float x, float power) { + return powf(x, power); +} +template <> +HDI double myPow(double x, double power) { + return pow(x, power); +} +/** @} */ + +/** + * @defgroup myTanh tanh function + * @{ + */ +template +HDI T myTanh(T x); +template <> +HDI float myTanh(float x) { + return tanhf(x); +} +template <> +HDI double myTanh(double x) { + return tanh(x); +} +/** @} */ + +/** + * @defgroup myATanh arctanh function + * @{ + */ +template +HDI T myATanh(T x); +template <> +HDI float myATanh(float x) { + return atanhf(x); +} +template <> +HDI double myATanh(double x) { + return atanh(x); +} +/** @} */ + +/** + * @defgroup LambdaOps Lambda operations in reduction kernels + * @{ + */ +// IdxType mostly to be used for MainLambda in *Reduction kernels +template +struct Nop { + HDI Type operator()(Type in, IdxType i = 0) { return in; } +}; + +template +struct L1Op { + HDI Type operator()(Type in, IdxType i = 0) { return myAbs(in); } +}; + +template +struct L2Op { + HDI Type operator()(Type in, IdxType i = 0) { return in * in; } +}; + +template +struct Sum { + HDI Type operator()(Type a, Type b) { return a + b; } +}; +/** @} */ + +/** + * @defgroup Sign Obtain sign value + * @brief Obtain sign of x + * @param x input + * @return +1 if x >= 0 and -1 otherwise + * @{ + */ +template +DI T signPrim(T x) { + return x < 0 ? -1 : +1; +} +template <> +DI float signPrim(float x) { + return signbit(x) == true ? -1.0f : +1.0f; +} +template <> +DI double signPrim(double x) { + return signbit(x) == true ? -1.0 : +1.0; +} +/** @} */ + +/** + * @defgroup Max maximum of two numbers + * @brief Obtain maximum of two values + * @param x one item + * @param y second item + * @return maximum of two items + * @{ + */ +template +DI T maxPrim(T x, T y) { + return x > y ? 
x : y; +} +template <> +DI float maxPrim(float x, float y) { + return fmaxf(x, y); +} +template <> +DI double maxPrim(double x, double y) { + return fmax(x, y); +} +/** @} */ + +/** apply a warp-wide fence (useful from Volta+ archs) */ +DI void warpFence() { +#if __CUDA_ARCH__ >= 700 + __syncwarp(); +#endif +} + +/** warp-wide any boolean aggregator */ +DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + inFlag = __any_sync(mask, inFlag); +#else + inFlag = __any(inFlag); +#endif + return inFlag; +} + +/** warp-wide all boolean aggregator */ +DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + inFlag = __all_sync(mask, inFlag); +#else + inFlag = __all(inFlag); +#endif + return inFlag; +} + +/** + * @brief Shuffle the data inside a warp + * @tparam T the data type (currently assumed to be 4B) + * @param val value to be shuffled + * @param srcLane lane from where to shuffle + * @param width lane width + * @param mask mask of participating threads (Volta+) + * @return the shuffled data + */ +template +DI T shfl(T val, int srcLane, int width = WarpSize, + uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + return __shfl_sync(mask, val, srcLane, width); +#else + return __shfl(val, srcLane, width); +#endif +} + +/** + * @brief Shuffle the data inside a warp + * @tparam T the data type (currently assumed to be 4B) + * @param val value to be shuffled + * @param laneMask mask to be applied in order to perform xor shuffle + * @param width lane width + * @param mask mask of participating threads (Volta+) + * @return the shuffled data + */ +template +DI T shfl_xor(T val, int laneMask, int width = WarpSize, + uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + return __shfl_xor_sync(mask, val, laneMask, width); +#else + return __shfl_xor(val, laneMask, width); +#endif +} + +/** + * @brief Warp-level sum reduction + * @param val input value + * @return only the lane0 will contain valid reduced result + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block. All threads in the warp must enter this + * function together + * @todo Expand this to support arbitrary reduction ops + */ +template +DI T warpReduce(T val) { +#pragma unroll + for (int i = WarpSize / 2; i > 0; i >>= 1) { + T tmp = shfl(val, laneId() + i); + val += tmp; + } + return val; +} + +/** + * @brief 1-D block-level sum reduction + * @param val input value + * @param smem shared memory region needed for storing intermediate results. It + * must alteast be of size: `sizeof(T) * nWarps` + * @return only the thread0 will contain valid reduced result + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block. All threads in the block must enter this + * function together + * @todo Expand this to support arbitrary reduction ops + */ +template +DI T blockReduce(T val, char *smem) { + auto *sTemp = reinterpret_cast(smem); + int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; + int lid = laneId(); + int wid = threadIdx.x / WarpSize; + val = warpReduce(val); + if (lid == 0) sTemp[wid] = val; + __syncthreads(); + val = lid < nWarps ? sTemp[lid] : T(0); + return warpReduce(val); +} + +/** + * @brief Simple utility function to determine whether user_stream or one of the + * internal streams should be used. 
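// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// blockReduce needs sizeof(T) bytes of shared memory per warp and returns the
// valid sum only on thread 0, as documented above. Typical use (hypothetical):
//
//   template <int TPB>
//   __global__ void sum_kernel(const float *x, float *block_sums, int n) {
//     __shared__ char smem[sizeof(float) * (TPB / raft::WarpSize)];
//     int i = threadIdx.x + blockIdx.x * TPB;
//     float v = i < n ? x[i] : 0.0f;
//     float s = raft::blockReduce(v, smem);
//     if (threadIdx.x == 0) block_sums[blockIdx.x] = s;
//   }
// ----------------------------------------------------------------------------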
+ * @param user_stream main user stream + * @param int_streams array of internal streams + * @param n_int_streams number of internal streams + * @param idx the index for which to query the stream + */ +inline cudaStream_t select_stream(cudaStream_t user_stream, + cudaStream_t *int_streams, int n_int_streams, + int idx) { + return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; +} + +} // namespace raft diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index b4713b9d53..86c60addf2 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -21,7 +21,10 @@ #include #include +#include #include +#include +#include ///@todo: enable once logging has been enabled in raft //#include "logger.hpp" @@ -256,4 +259,107 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ +/** cuda malloc */ +template +void allocate(Type*& ptr, size_t len, bool setZero = false) { + CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); + if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); +} + +/** helper method to get max usable shared mem per block parameter */ +inline int getSharedMemPerBlock() { + int devId; + CUDA_CHECK(cudaGetDevice(&devId)); + int smemPerBlk; + CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, + cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; +} + +/** helper method to get multi-processor count parameter */ +inline int getMultiProcessorCount() { + int devId; + CUDA_CHECK(cudaGetDevice(&devId)); + int mpCount; + CUDA_CHECK( + cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; +} + +/** helper method to convert an array on device to a string on host */ +template +std::string arr2Str(const T* arr, int size, std::string name, + cudaStream_t stream, int width = 4) { + std::stringstream ss; + + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; + + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; + + free(arr_h); + + return ss.str(); +} + +/** this seems to be unused, but may be useful in the future */ +template +void ASSERT_DEVICE_MEM(T* ptr, std::string name) { + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name + << ". device=" << s_att.device << ", err=" << s_err << std::endl; +} + +inline uint32_t curTimeMillis() { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration) + .count(); +} + +/** Helper function to calculate need memory for allocate to store dense matrix. + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { + return rows * columns; +} + +/** Helper function to check alignment of pointer. 
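// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// The new cudart_utils.h helpers combine naturally when debugging
// (hypothetical pointer and stream):
//
//   float *d_x;
//   raft::allocate(d_x, 16, true);  // cudaMalloc + zero-fill
//   // ... enqueue work on `stream` ...
//   std::cout << raft::arr2Str(d_x, 16, "x", stream);
//   CUDA_CHECK(cudaFree(d_x));
//
// arr2Str copies to host and synchronizes the stream internally, so it is
// intended for debugging rather than hot paths.
// ----------------------------------------------------------------------------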
+ * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ +template +bool is_aligned(Type* ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +/** calculate greatest common divisor of two numbers +* @a integer +* @b integer +* @ return gcd of a and b +*/ +template +IntType gcd(IntType a, IntType b) { + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; +} + } // namespace raft diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh new file mode 100644 index 0000000000..7a454f64e2 --- /dev/null +++ b/cpp/include/raft/linalg/add.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "binary_op.cuh" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Elementwise scalar add operation on the input buffer + * + * @tparam InT input data-type. Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + */ +template +void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, + cudaStream_t stream) { + auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; + unaryOp(out, in, len, op, stream); +} + +/** + * @brief Elementwise add operation on the input buffers + * @tparam InT input data-type. 
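// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// is_aligned and gcd are small host-side helpers, e.g.:
//
//   raft::is_aligned(d_ptr, 16);  // true if d_ptr is 16-byte aligned
//   raft::gcd(12, 18);            // == 6
// ----------------------------------------------------------------------------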
Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + */ +template +void add(OutT *out, const InT *in1, const InT *in2, IdxType len, + cudaStream_t stream) { + auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; + binaryOp(out, in1, in2, len, op, stream); +} + +template +__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { + IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { + outDev[i] = inDev[i] + *singleScalarDev; + } +} + +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param outDev the output buffer + * @param inDev the input buffer + * @param singleScalarDev pointer to the scalar located in device memory + * @param len number of elements in the input and output buffer + * @param stream cuda stream + */ +template +void addDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { + // TODO: block dimension has not been tuned + dim3 block(256); + dim3 grid(raft::ceildiv(len, (IdxType)block.x)); + add_dev_scalar_kernel + <<>>(outDev, inDev, singleScalarDev, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh new file mode 100644 index 0000000000..f8142d9a82 --- /dev/null +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void binaryOpKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len, Lambda op) { + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); + } + c.store(out, idx); +} + +template +void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, + IdxType len, Lambda op, cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + binaryOpKernel + <<>>(out, in1, in2, len, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise binary operation on the input arrays + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` + */ +template +void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, + Lambda op, cudaStream_t stream) { + constexpr auto maxSize = + sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + if (16 / maxSize && bytes % 16 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (1 / maxSize) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else { + binaryOpImpl(out, in1, in2, len, + op, stream); + } +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh new file mode 100644 index 0000000000..ef983ff3d0 --- /dev/null +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +// Kernel (based on norm.cuh) to perform reductions along the coalesced dimension +// of the matrix, i.e. reduce along rows for row major or reduce along columns +// for column major layout. Kernel does an inplace reduction adding to original +// values of dots. 
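// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// The element-wise launchers in add.cuh and binary_op.cuh compose with device
// lambdas; e.g. a fused a*x + y over n floats (hypothetical names):
//
//   raft::linalg::binaryOp(
//     d_out, d_x, d_y, n,
//     [a] __device__(float x, float y) { return a * x + y; }, stream);
//
// binaryOp picks the widest vector width (16B down to scalar) that divides
// the buffer size, as its implementation above shows.
// ----------------------------------------------------------------------------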
+template +__global__ void coalescedReductionKernel(OutType *dots, const InType *data, + int D, int N, OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + OutType thread_data = init; + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + thread_data = reduce_op(thread_data, main_op(data[idx], i)); + } + OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc)); + } else { + dots[blockIdx.x] = final_op(acc); + } + } +} + +/** + * @brief Compute reduction of the input matrix along the leading dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added inplace or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType *dots, const InType *data, int D, int N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) { + // One block per reduction + // Efficient only for large leading dimensions + if (D <= 32) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else if (D <= 64) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else if (D <= 128) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh new file mode 100644 index 0000000000..c848ac1f4b --- /dev/null +++ b/cpp/include/raft/linalg/divide.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, + stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh new file mode 100644 index 0000000000..6172618380 --- /dev/null +++ b/cpp/include/raft/linalg/eig.cuh @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
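// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// coalescedReduction reduces along the coalesced dimension, i.e. the rows of
// a row-major matrix. Row-wise L2 norms of an N x D row-major float matrix
// (hypothetical names):
//
//   raft::linalg::coalescedReduction(
//     d_norms, d_mat, D, N, 0.0f, stream, false /* inplace */,
//     raft::L2Op<float>(), raft::Sum<float>(),
//     [] __device__(float v) { return raft::mySqrt(v); });
// ----------------------------------------------------------------------------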
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @defgroup eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream cuda stream + * @{ + */ +template +void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int lwork; + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, in, + n_cols, eig_vals, &lwork)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + raft::mr::device::buffer d_dev_info(allocator, stream, 1); + + raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); + CUDA_CHECK(cudaGetLastError()); + + int dev_info; + raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT(dev_info == 0, + "eig.cuh: eigensolver couldn't converge to a solution. " + "This usually occurs when some of the features do not vary enough."); +} + +enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; + +#if CUDART_VERSION >= 10010 + +/** + * @defgroup eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. 
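// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// eigDC above computes all eigenpairs of a column-major symmetric matrix; the
// input is first copied into eig_vectors, so it is left untouched, and the
// call synchronizes the stream to validate cusolver's devInfo. For an n x n
// covariance matrix (hypothetical buffers):
//
//   raft::linalg::eigDC(handle, d_cov, n, n, d_eig_vectors, d_eig_vals,
//                       stream);
// ----------------------------------------------------------------------------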
+ * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param n_eig_vals: number of eigenvectors to be generated + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream cuda stream + * @{ + */ +template +void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, + EigVecMemUsage memUsage, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int lwork; + int h_meig; + + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + raft::mr::device::buffer d_dev_info(allocator, stream, 1); + raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); + + if (memUsage == OVERWRITE_INPUT) { + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); + } else if (memUsage == COPY_INPUT) { + d_eig_vectors.resize(n_rows * n_cols, stream); + raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), + math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, + d_work.data(), lwork, d_dev_info.data(), stream)); + } + + CUDA_CHECK(cudaGetLastError()); + + int dev_info; + raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT(dev_info == 0, + "eig.cuh: eigensolver couldn't converge to a solution. " + "This usually occurs when some of the features do not vary enough."); + + if (memUsage == OVERWRITE_INPUT) { + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, + stream); + } else if (memUsage == COPY_INPUT) { + raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, + n_rows, n_eig_vals, stream); + } +} + +#endif + +/** + * @defgroup overloaded function for eig decomp with Jacobi method for the + * column-major symmetric matrices (in parameter) + * @param handle: raft handle + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param tol: error tolerance for the jacobi method. Algorithm stops when the + * error is below tol + * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. 
+ * @{ + */ +template +void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + syevjInfo_t syevj_params = nullptr; + CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); + CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); + CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); + + int lwork; + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, + eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + raft::mr::device::buffer dev_info(allocator, stream, 1); + + raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + dev_info.data(), syevj_params, stream)); + + int executed_sweeps; + CUSOLVER_CHECK( + cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + + CUDA_CHECK(cudaGetLastError()); + CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh new file mode 100644 index 0000000000..a46d550220 --- /dev/null +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
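// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// eigJacobi trades accuracy for speed through its tol/sweeps parameters; a
// looser, faster decomposition might look like (hypothetical buffers):
//
//   raft::linalg::eigJacobi(handle, d_cov, n, n, d_eig_vectors, d_eig_vals,
//                           stream, 1e-5f, 10 /* sweeps */);
// ----------------------------------------------------------------------------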
+ */ + +#pragma once + +#include "binary_op.cuh" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void scalarAdd(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in + scalar; }, + stream); +} + +template +void scalarMultiply(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, + stream); +} +/** @} */ + +/** + * @defgroup BinaryOps Element-wise binary operations on the input buffers + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + * @{ + */ +template +void eltwiseAdd(math_t *out, const math_t *in1, const math_t *in2, IdxType len, + cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a + b; }, + stream); +} + +template +void eltwiseSub(math_t *out, const math_t *in1, const math_t *in2, IdxType len, + cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a - b; }, + stream); +} + +template +void eltwiseMultiply(math_t *out, const math_t *in1, const math_t *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a * b; }, + stream); +} + +template +void eltwiseDivide(math_t *out, const math_t *in1, const math_t *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a / b; }, + stream); +} + +template +void eltwiseDivideCheckZero(math_t *out, const math_t *in1, const math_t *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, + [] __device__(math_t a, math_t b) { + if (b == math_t(0.0)) + return math_t(0.0); + else + return a / b; + }, + stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh new file mode 100644 index 0000000000..0a4897cc0b --- /dev/null +++ b/cpp/include/raft/linalg/gemm.cuh @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
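// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// The eltwise helpers cover the common arithmetic cases without writing a
// lambda (hypothetical names):
//
//   raft::linalg::eltwiseMultiply(d_out, d_a, d_b, n, stream);
//   raft::linalg::eltwiseDivideCheckZero(d_ratio, d_num, d_den, n, stream);
//
// eltwiseDivideCheckZero yields 0 wherever the denominator is 0, e.g. when
// normalizing by possibly-empty cluster counts.
// ----------------------------------------------------------------------------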
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @brief the wrapper of cublas gemm function + * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C + * @tparam math_t the type of input/output matrices + * @param handle raft handle + * @param a input matrix + * @param n_rows_a number of rows of A + * @param n_cols_a number of columns of A + * @param b input matrix + * @param c output matrix + * @param n_rows_c number of rows of C + * @param n_cols_c number of columns of C + * @param trans_a cublas transpose op for A + * @param trans_b cublas transpose op for B + * @param alpha scalar + * @param beta scalar + * @param stream cuda stream + */ +template +void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, + int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, + cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, + math_t beta, cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + int m = n_rows_c; + int n = n_cols_c; + int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + int lda = trans_a == CUBLAS_OP_T ? k : m; + int ldb = trans_b == CUBLAS_OP_T ? n : k; + int ldc = m; + CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, + b, ldb, &beta, c, ldc, stream)); +} + +template +void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, + int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, + cublasOperation_t trans_a, cublasOperation_t trans_b, + cudaStream_t stream) { + math_t alpha = math_t(1); + math_t beta = math_t(0); + gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, + trans_b, alpha, beta, stream); +} + +/** + * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * combinations of operand layouts. + * It computes the following equation: Z = alpha . X * Y + beta . Z + * @tparam T Data type of input/output matrices (float/double) + * @param handle raft handle + * @param z output matrix of size M rows x N columns + * @param x input matrix of size M rows x K columns + * @param y input matrix of size K rows x N columns + * @param _M number of rows of X and Z + * @param _N number of rows of Y and columns of Z + * @param _K number of columns of X and rows of Y + * @param isZColMajor Storage layout of Z. true = col major, false = row major + * @param isXColMajor Storage layout of X. true = col major, false = row major + * @param isYColMajor Storage layout of Y. true = col major, false = row major + * @param stream cuda stream + * @param alpha scalar + * @param beta scalar + */ +template +void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, + int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, + cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + cublasOperation_t trans_a, trans_b; + T *a, *b, *c; + int lda, ldb, ldc; + int M, N, K; + // This function performs c = a * b. Based on the required output layout, + // either a = x, b = y or a = y, b = x. In either case c = z. + if (isZColMajor == true) { + // Result c is required in column major layout. Thus we perform, + // z = x * y + // Using BLAS call c = a * b. Therefore a = x, b = y and c = z + + a = x; + // If x is in row major layout, cublas needs to transpose x first, + // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major + // layout, trans_b needs to be CUBLAS_OP_N. 
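// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch for the surrounding function, not
// part of this patch.]
// This layout-aware overload picks operands so that one column-major cuBLAS
// GEMM call serves any combination of row-/column-major X, Y and Z. For
// example, multiplying a row-major M x K matrix by a row-major K x N matrix
// into a row-major Z (hypothetical buffers):
//
//   raft::linalg::gemm(handle, d_z, d_x, d_y, M, N, K,
//                      false /* Z row-major */, false /* X row-major */,
//                      false /* Y row-major */, stream);
// ----------------------------------------------------------------------------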
+ trans_a = isXColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; + // Set leading dimension appropriately + lda = isXColMajor == true ? _M : _K; + + b = y; + // If y is in row major layout, cublas needs to transpose y first, + // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major + // layout, trans_b needs to be CUBLAS_OP_N. + trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; + ldb = isYColMajor == true ? _K : _N; + + c = z; + ldc = _M; + M = _M; + N = _N; + K = _K; + } else { + // Result c is required in row major layout Thus we pick + // a = y, b = x and c = a * b = y * x + // cublas produces output matrix only in column major layout. To get output + // matrix on row major layout, we need to produce transpose of output + // in column major layout. Therefore we perform, + // tr(z) = tr(y) * tr(x) + // we model this using cublas call for c = a * b + // therefore a = tr(y), b = tr(x) and c = tr(z) + + a = y; + // If y is in row major layout, it can be/ interpreted as tr(y) on column + // major layout. Therefore we can pass trans_a as CUBLAS_OP_N. If y is in + // column major layout, cublas needs to transpose y first, therefore + // trans_a needs to be CUBLAS_OP_T + trans_a = isYColMajor == true ? CUBLAS_OP_T : CUBLAS_OP_N; + // Set leading dimension appropriately + lda = isYColMajor == true ? _K : _N; + + b = x; + // If x is in row major layout, it can be interpreted as tr(x) on column + // major layout. Therefore we can pass trans_b as CUBLAS_OP_N. If x is in + // column major layout, cublas needs to trasponse x first, therefore + // trans_b needs to be CUBLAS_OP_T + trans_b = isXColMajor == true ? CUBLAS_OP_T : CUBLAS_OP_N; + // Set leading dimension appropriately + ldb = isXColMajor == true ? _M : _K; + + c = z; + ldc = _N; + + M = _N; + N = _M; + K = _K; + } + // Actual cuBLAS call + CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, + b, ldb, &beta, c, ldc, stream)); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h new file mode 100644 index 0000000000..edd18b3bee --- /dev/null +++ b/cpp/include/raft/linalg/gemv.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace raft { +namespace linalg { + +template +void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, + const math_t* x, int incx, math_t* y, int incy, bool trans_a, + math_t alpha, math_t beta, cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + cublasOperation_t op_a = trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N;
+
+  // Unfortunately there is a clash of terminology: the BLAS convention
+  // (https://docs.nvidia.com/cuda/cublas/index.html) is the opposite of the
+  // Machine Learning one.
+  // In BLAS:
+  // m - number of rows in the input matrix
+  // n - number of columns in the input matrix
+  // lda - the leading dimension; it exists so that submatrices can be
+  // operated on without copying, and should simply be equal to m otherwise.
+  // lda describes the memory layout only; it has nothing to do with whether
+  // cuBLAS performs a transpose.
+
+  // In Machine Learning:
+  // m - number of columns in the design matrix (number of features)
+  // n - number of rows in the design matrix (number of training examples)
+
+  int m = n_rows;
+  int n = n_cols;
+  int lda = trans_a ? m : n;
+
+  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta,
+                          y, incy, stream));
+}
+
+template <typename math_t>
+void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
+          int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha,
+          math_t beta, cudaStream_t stream) {
+  gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
+}
+
+template <typename math_t>
+void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
+          int n_cols_a, const math_t* x, math_t* y, bool trans_a,
+          cudaStream_t stream) {
+  math_t alpha = math_t(1);
+  math_t beta = math_t(0);
+
+  gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
+}
+
+};  // namespace linalg
+};  // namespace raft
diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
new file mode 100644
index 0000000000..1a6513b915
--- /dev/null
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace raft {
+namespace linalg {
+
+template <typename Type, int TPB>
+__device__ void reduce(Type *out, const Type acc) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type tmp = BlockReduce(temp_storage).Sum(acc);
+  if (threadIdx.x == 0) {
+    raft::myAtomicAdd(out, tmp);
+  }
+}
+
+template <typename Type, typename MapOp, int TPB, typename... Args>
+__global__ void mapThenSumReduceKernel(Type *out, size_t len, MapOp map,
+                                       const Type *in, Args... args) {
+  Type acc = (Type)0;
+  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
+
+  if (idx < len) {
+    acc = map(in[idx], args[idx]...);
+  }
+
+  __syncthreads();
+
+  reduce<Type, TPB>(out, acc);
+}
+
+template <typename Type, typename MapOp, int TPB, typename... Args>
+void mapThenSumReduceImpl(Type *out, size_t len, MapOp map, cudaStream_t stream,
+                          const Type *in, Args...
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
new file mode 100644
index 0000000000..902816418f
--- /dev/null
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace raft {
+namespace linalg {
+
+template <typename Type, int veclen_, typename Lambda, typename IdxType>
+__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
+                                     const Type *vector, IdxType D, IdxType N,
+                                     bool rowMajor, bool bcastAlongRows,
+                                     Lambda op) {
+  typedef TxN_t<Type, veclen_> VecType;
+  IdxType len = N * D;
+  IdxType idx = threadIdx.x;
+  idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
+  idx *= VecType::Ratio;
+  if (idx >= len) return;
+  IdxType vIdx;
+  VecType mat, vec;
+  ///@todo: yikes! use fast-int-div here.
+  ///@todo: shared mem for vector could help with perf
+  if (rowMajor && bcastAlongRows) {
+    vIdx = idx % D;
+    vec.load(vector, vIdx);
+  } else if (!rowMajor && !bcastAlongRows) {
+    vIdx = idx % N;
+    vec.load(vector, vIdx);
+  } else if (rowMajor && !bcastAlongRows) {
+    vIdx = idx / D;
+    vec.fill(vector[vIdx]);
+  } else {
+    vIdx = idx / N;
+    vec.fill(vector[vIdx]);
+  }
+  mat.load(matrix, idx);
+#pragma unroll
+  for (int i = 0; i < VecType::Ratio; ++i)
+    mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
+  mat.store(out, idx);
+}
+
+template <typename Type, int veclen_, typename Lambda, typename IdxType,
+          int TPB>
+void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
+                        IdxType D, IdxType N, bool rowMajor,
+                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+  IdxType len = N * D;
+  IdxType nblks =
+    raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType)TPB);
+  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
+                                bcastAlongRows, op);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Apply an element-wise binary operation between each row (or column)
+ * of a matrix and a given vector.
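+ *
+ * @note when `bcastAlongRows` is true, `vec` must have D elements (one per
+ * column of the matrix); otherwise it must have N elements (one per row).
+ * This follows from the four indexing cases in the kernel above.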
+ * @tparam Type the matrix/vector type + * @tparam Lambda a device function which represents a binary operator + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output matrix (passing out = matrix makes it in-place) + * @param matrix the input matrix + * @param vec the vector + * @param D number of columns of matrix + * @param N number of rows of matrix + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether the broadcast of vector needs to happen along + * the rows of the matrix or columns + * @param op the mathematical operation + * @param stream cuda stream where to launch work + */ +template +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, + cudaStream_t stream) { + IdxType stride = rowMajor ? D : N; + size_t bytes = stride * sizeof(Type); + if (16 / sizeof(Type) && bytes % 16 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (8 / sizeof(Type) && bytes % 8 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (4 / sizeof(Type) && bytes % 4 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (2 / sizeof(Type) && bytes % 2 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (1 / sizeof(Type)) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } +} + +///@todo: come up with a cleaner interface to support these cases in future! + +template +__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, + const Type *vector1, const Type *vector2, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op) { + typedef TxN_t VecType; + IdxType len = N * D; + IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; + if (idx >= len) return; + IdxType vIdx; + VecType mat, vec1, vec2; + ///@todo: yikes! use fast-int-div here. + ///@todo: shared mem for vector could help with perf + if (rowMajor && bcastAlongRows) { + vIdx = idx % D; + vec1.load(vector1, vIdx); + vec2.load(vector2, vIdx); + } else if (!rowMajor && !bcastAlongRows) { + vIdx = idx % N; + vec1.load(vector1, vIdx); + vec2.load(vector2, vIdx); + } else if (rowMajor && !bcastAlongRows) { + vIdx = idx / D; + vec1.fill(vector1[vIdx]); + vec2.fill(vector2[vIdx]); + } else { + vIdx = idx / N; + vec1.fill(vector1[vIdx]); + vec2.fill(vector2[vIdx]); + } + mat.load(matrix, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) + mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]); + mat.store(out, idx); +} + +template +void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); + matrixVectorOpKernel + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, + bcastAlongRows, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Operations for all the columns or rows with the given vectors. 
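+ *
+ * @note as in the single-vector overload above, `vec1` and `vec2` must both
+ * have D elements when `bcastAlongRows` is true, and N elements otherwise.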
+ * @tparam Type the matrix/vector type + * @tparam Lambda a device function which represents a binary operator + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output matrix (passing out = matrix makes it in-place) + * @param matrix the input matrix + * @param vec1 the first vector + * @param vec2 the second vector + * @param D number of columns of matrix + * @param N number of rows of matrix + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether the broadcast of vector needs to happen along + * the rows of the matrix or columns + * @param op the mathematical operation + * @param stream cuda stream where to launch work + */ +template +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType stride = rowMajor ? D : N; + size_t bytes = stride * sizeof(Type); + if (16 / sizeof(Type) && bytes % 16 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (8 / sizeof(Type) && bytes % 8 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (4 / sizeof(Type) && bytes % 4 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (2 / sizeof(Type) && bytes % 2 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (1 / sizeof(Type)) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh new file mode 100644 index 0000000000..9d1538c172 --- /dev/null +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "map_then_reduce.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief CUDA version mean squared error function mean((A-B)**2) + * @tparam math_t data-type upon which the math operation will be performed + * @tparam TPB threads-per-block + * @param out the output mean squared error value (assumed to be a device pointer) + * @param A input array (assumed to be a device pointer) + * @param B input array (assumed to be a device pointer) + * @param len number of elements in the input arrays + * @param weight weight to apply to every term in the mean squared error calculation + * @param stream cuda-stream where to launch this kernel + */ +template +void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len, + math_t weight, cudaStream_t stream) { + auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { + math_t diff = a - b; + return diff * diff * weight / len; + }; + mapThenSumReduce(out, len, sq_diff, stream, A, + B); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh new file mode 100644 index 0000000000..ce948c927d --- /dev/null +++ b/cpp/include/raft/linalg/multiply.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, + stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh new file mode 100644 index 0000000000..64930a7123 --- /dev/null +++ b/cpp/include/raft/linalg/norm.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +/** different types of norms supported on the input buffers */ +enum NormType { L1Norm = 0, L2Norm }; + +/** + * @brief Compute row-wise norm of the input matrix and perform fin_op lambda + * + * Row-wise norm is useful while computing pairwise distance matrix, for + * example. + * This is used in many clustering algos like knn, kmeans, dbscan, etc... The + * current implementation is optimized only for bigger values of 'D'. + * + * @tparam Type the data type + * @tparam Lambda device final lambda + * @tparam IdxType Integer type used to for addressing + * @param dots the output vector of row-wise dot products + * @param data the input matrix (currently assumed to be row-major) + * @param D number of columns of data + * @param N number of rows of data + * @param type the type of norm to be applied + * @param rowMajor whether the input is row-major or not + * @param stream cuda stream where to launch work + * @param fin_op the final lambda op + */ +template > +void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, + bool rowMajor, cudaStream_t stream, + Lambda fin_op = raft::Nop()) { + switch (type) { + case L1Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, + raft::L1Op(), raft::Sum(), fin_op); + break; + case L2Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, + raft::L2Op(), raft::Sum(), fin_op); + break; + default: + ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +/** + * @brief Compute column-wise norm of the input matrix and perform fin_op + * @tparam Type the data type + * @tparam Lambda device final lambda + * @tparam IdxType Integer type used to for addressing + * @param dots the output vector of column-wise dot products + * @param data the input matrix (currently assumed to be row-major) + * @param D number of columns of data + * @param N number of rows of data + * @param type the type of norm to be applied + * @param rowMajor whether the input is row-major or not + * @param stream cuda stream where to launch work + * @param fin_op the final lambda op + */ +template > +void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, + bool rowMajor, cudaStream_t stream, + Lambda fin_op = raft::Nop()) { + switch (type) { + case L1Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, + raft::L1Op(), raft::Sum(), fin_op); + break; + case L2Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, + raft::L2Op(), raft::Sum(), fin_op); + break; + default: + ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh new file mode 100644 index 0000000000..cafa8d54f1 --- /dev/null +++ b/cpp/include/raft/linalg/qr.cuh @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @defgroup QRdecomp QR decomposition + * @{ + */ + +/** + * @brief compute QR decomp and return only Q matrix + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + * @{ + */ +template +void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, + int n_rows, int n_cols, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows, n = n_cols; + int k = min(m, n); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + + raft::mr::device::buffer tau(allocator, stream, k); + CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); + + raft::mr::device::buffer devInfo(allocator, stream, 1); + int Lwork; + + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); + raft::mr::device::buffer workspace(allocator, stream, Lwork); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); + /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. +#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + CUSOLVER_CHECK( + cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + workspace.resize(Lwork, stream); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); +} + +/** + * @brief compute QR decomp and return both Q and R matrices + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param R: R matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + */ +template +void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, + int n_rows, int n_cols, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows, n = n_cols; + raft::mr::device::buffer R_full(allocator, stream, m * n); + raft::mr::device::buffer tau(allocator, stream, min(m, n)); + CUDA_CHECK( + cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + int R_full_nrows = m, R_full_ncols = n; + CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + + int Lwork; + raft::mr::device::buffer devInfo(allocator, stream, 1); + + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, + R_full_ncols, R_full.data(), + R_full_nrows, &Lwork)); + raft::mr::device::buffer workspace(allocator, stream, Lwork); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, + tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. 
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); + + CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + int Q_nrows = m, Q_ncols = n; + + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, + min(Q_ncols, Q_nrows), Q, Q_nrows, + tau.data(), &Lwork)); + workspace.resize(Lwork, stream); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), + workspace.data(), Lwork, devInfo.data(), stream)); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh new file mode 100644 index 0000000000..d39577bbdd --- /dev/null +++ b/cpp/include/raft/linalg/reduce.cuh @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "coalesced_reduction.cuh" +#include "strided_reduction.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Compute reduction of the input matrix along the requested dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
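+ *
+ * For example, the row-wise L2 norm in norm.cuh maps onto this primitive as
+ * main_op = raft::L2Op, reduce_op = raft::Sum, with an optional square root
+ * as final_op.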
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D number of columns + * @param N number of rows + * @param init initial value to use for the reduction + * @param rowMajor input matrix is row-major or not + * @param alongRows whether to reduce along rows or columns + * @param stream cuda stream where to launch work + * @param inplace reduction result added inplace or overwrites old values? + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void reduce(OutType *dots, const InType *data, int D, int N, OutType init, + bool rowMajor, bool alongRows, cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) { + if (rowMajor && alongRows) { + coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); + } else if (rowMajor && !alongRows) { + stridedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); + } else if (!rowMajor && alongRows) { + stridedReduction(dots, data, N, D, init, stream, inplace, main_op, + reduce_op, final_op); + } else { + coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, + reduce_op, final_op); + } +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh new file mode 100644 index 0000000000..fff09d2046 --- /dev/null +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +// Kernel to perform reductions along the strided dimension +// of the matrix, i.e. 
reduce along columns for row major or reduce along rows +// for column major layout +template +__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, + int N, Type init, MainLambda main_op) { + // Thread reduction + Type thread_data = Type(init); + int colStart = blockIdx.x * blockDim.x + threadIdx.x; + if (colStart < D) { + int rowStart = blockIdx.y * blockDim.y + threadIdx.y; + int stride = blockDim.y * gridDim.y; + for (int j = rowStart; j < N; j += stride) { + int idx = colStart + j * D; + thread_data += main_op(data[idx], j); + } + } + + // Block reduction + extern __shared__ char tmp[]; // One element per thread in block + Type *temp = (Type *)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; + temp[myidx] = thread_data; + __syncthreads(); + for (int j = blockDim.y / 2; j > 0; j /= 2) { + if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; + __syncthreads(); + } + + // Grid reduction + if ((colStart < D) && (threadIdx.y == 0)) + raft::myAtomicAdd(dots + colStart, temp[myidx]); +} + +// Kernel to perform reductions along the strided dimension +// of the matrix, i.e. reduce along columns for row major or reduce along rows +// for column major layout +template +__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, + int N, OutType init, MainLambda main_op, + ReduceLambda reduce_op) { + // Thread reduction + OutType thread_data = init; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + if (colStart < D) { + IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; + IdxType stride = blockDim.y * gridDim.y; + for (IdxType j = rowStart; j < N; j += stride) { + IdxType idx = colStart + j * D; + thread_data = reduce_op(thread_data, main_op(data[idx], j)); + } + } + + // Block reduction + extern __shared__ char tmp[]; // One element per thread in block + auto *temp = (OutType *)tmp; // Cast to desired type + IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); + temp[myidx] = thread_data; + __syncthreads(); + for (int j = blockDim.y / 2; j > 0; j /= 2) { + if (threadIdx.y < j) + temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + __syncthreads(); + } + + // Grid reduction + if ((colStart < D) && (threadIdx.y == 0)) + raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op); +} + +/** + * @brief Compute reduction of the input matrix along the strided dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
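+ *
+ * @note the grid-level step of this reduction uses atomics, which is why a
+ * non-inplace call first resets `dots` to `init` through a unaryOp (see the
+ * implementation below).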
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added inplace or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) { + ///@todo: this extra should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + if (!inplace) + raft::linalg::unaryOp( + dots, dots, D, [init] __device__(OutType a) { return init; }, stream); + + // Arbitrary numbers for now, probably need to tune + const dim3 thrds(32, 16); + IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), + raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); + const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; + + ///@todo: this complication should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + if (std::is_same>::value && + std::is_same::value) + stridedSummationKernel + <<>>(dots, data, D, N, init, main_op); + else + stridedReductionKernel + <<>>(dots, data, D, N, init, main_op, + reduce_op); + + ///@todo: this complication should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + // Perform final op on output data + if (!std::is_same>::value) + raft::linalg::unaryOp(dots, dots, D, final_op, stream); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh new file mode 100644 index 0000000000..882c105689 --- /dev/null +++ b/cpp/include/raft/linalg/subtract.cuh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "binary_op.cuh" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Elementwise scalar subtraction operation on the input buffer + * + * @tparam InT input data-type. 
Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + */ +template +void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, + cudaStream_t stream) { + auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; + unaryOp(out, in, len, op, stream); +} + +/** + * @brief Elementwise subtraction operation on the input buffers + * @tparam InT input data-type. Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + */ +template +void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len, + cudaStream_t stream) { + auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; + binaryOp(out, in1, in2, len, op, stream); +} + +template +__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { + //TODO: kernel do not use shared memory in current implementation + int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { + outDev[i] = inDev[i] - *singleScalarDev; + } +} + +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param outDev the output buffer + * @param inDev the input buffer + * @param singleScalarDev pointer to the scalar located in device memory + * @param len number of elements in the input and output buffer + * @param stream cuda stream + * @remark block size has not been tuned + */ +template +void subtractDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { + // Just for the note - there is no way to express such operation with cuBLAS in effective way + // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda + const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); + subtract_dev_scalar_kernel + <<>>(outDev, inDev, singleScalarDev, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh new file mode 100644 index 0000000000..7fb22bb2da --- /dev/null +++ b/cpp/include/raft/linalg/svd.cuh @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "eig.cuh" +#include "gemm.cuh" +#include "transpose.h" + +namespace raft { +namespace linalg { + +/** + * @brief singular value decomposition (SVD) on the column major float type + * input matrix using QR method + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param sing_vals: singular values of input matrix + * @param left_sing_vecs: left singular values of input matrix + * @param right_sing_vecs: right singular values of input matrix + * @param trans_right: transpose right vectors or not + * @param gen_left_vec: generate left eig vector. Not activated. + * @param gen_right_vec: generate right eig vector. Not activated. + * @param stream cuda stream + */ +// TODO: activate gen_left_vec and gen_right_vec options +// TODO: couldn't template this function due to cusolverDnSgesvd and +// cusolverSnSgesvd. Check if there is any other way. +template +void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, + T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, + bool trans_right, bool gen_left_vec, bool gen_right_vec, + cudaStream_t stream) { + std::shared_ptr allocator = + handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + +#if CUDART_VERSION >= 10010 + // 46340: sqrt of max int value + ASSERT(n_rows <= 46340, + "svd solver is not supported for the data that has more than 46340 " + "samples (rows) " + "if you are using CUDA version 10.1. Please use other solvers such as " + "eig if it is available."); +#endif + + const int m = n_rows; + const int n = n_cols; + + raft::mr::device::buffer devInfo(allocator, stream, 1); + T *d_rwork = nullptr; + + int lwork = 0; + CUSOLVER_CHECK( + cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + raft::mr::device::buffer d_work(allocator, stream, lwork); + + char jobu = 'S'; + char jobvt = 'A'; + + if (!gen_left_vec) { + char new_u = 'N'; + strcpy(&jobu, &new_u); + } + + if (!gen_right_vec) { + char new_vt = 'N'; + strcpy(&jobvt, &new_vt); + } + + CUSOLVER_CHECK(cusolverDngesvd( + cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, + right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); + + // Transpose the right singular vector back + if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); + + CUDA_CHECK(cudaGetLastError()); + + int dev_info; + raft::update_host(&dev_info, devInfo.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT(dev_info == 0, + "svd.cuh: svd couldn't converge to a solution. 
" + "This usually occurs when some of the features do not vary enough."); +} + +template +void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, + T *U, T *V, bool gen_left_vec, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + + int len = n_cols * n_cols; + raft::mr::device::buffer in_cross_mult(allocator, stream, len); + + T alpha = T(1); + T beta = T(0); + raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(), + n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, + stream); + + eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); + + raft::matrix::colReverse(V, n_cols, n_cols, stream); + raft::matrix::rowReverse(S, n_cols, 1, stream); + + raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); + + if (gen_left_vec) { + raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols, + CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, + true, stream); + } +} + +/** + * @brief on the column major input matrix using Jacobi method + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param sing_vals: singular values of input matrix + * @param left_sing_vecs: left singular vectors of input matrix + * @param right_sing_vecs: right singular vectors of input matrix + * @param gen_left_vec: generate left eig vector. Not activated. + * @param gen_right_vec: generate right eig vector. Not activated. + * @param tol: error tolerance for the jacobi method. Algorithm stops when the + * error is below tol + * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. 
+ * @param stream cuda stream + */ +template +void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + math_t *sing_vals, math_t *left_sing_vecs, + math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, + math_t tol, int max_sweeps, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + gesvdjInfo_t gesvdj_params = NULL; + + CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params)); + CUSOLVER_CHECK(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol)); + CUSOLVER_CHECK(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps)); + + int m = n_rows; + int n = n_cols; + + raft::mr::device::buffer devInfo(allocator, stream, 1); + + int lwork = 0; + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), + gesvdj_params, stream)); + + CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +/** + * @brief reconstruct a matrix use left and right singular vectors and + * singular values + * @param handle: raft handle + * @param U: left singular vectors of size n_rows x k + * @param S: square matrix with singular values on its diagonal, k x k + * @param V: right singular vectors of size n_cols x k + * @param out: reconstructed matrix to be returned + * @param n_rows: number rows of output matrix + * @param n_cols: number columns of output matrix + * @param k: number of singular values + * @param stream cuda stream + */ +template +void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, + math_t *V, math_t *out, int n_rows, int n_cols, int k, + cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + + const math_t alpha = 1.0, beta = 0.0; + raft::mr::device::buffer SVT(allocator, stream, k * n_cols); + + raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, + CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, + CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); +} + +/** + * @brief reconstruct a matrix use left and right singular vectors and + * singular values + * @param handle: raft handle + * @param A_d: input matrix + * @param U: left singular vectors of size n_rows x k + * @param S_vec: singular values as a vector + * @param V: right singular vectors of size n_cols x k + * @param n_rows: number rows of output matrix + * @param n_cols: number columns of output matrix + * @param k: number of singular values to be computed, 1.0 for normal SVD + * @param tol: tolerance for the evaluation + * @param stream cuda stream + */ +template +bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, + math_t *S_vec, math_t *V, int n_rows, int n_cols, + int k, math_t tol, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + + int m = n_rows, n = n_cols; + + // form product matrix + raft::mr::device::buffer P_d(allocator, stream, m * n); + raft::mr::device::buffer S_mat(allocator, stream, k * k); + CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, 
stream)); + CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); + + raft::matrix::initializeDiagonalMatrix(S_vec, S_mat.data(), k, k, stream); + svdReconstruction(handle, U, S_mat.data(), V, P_d.data(), m, n, k, stream); + + // get norms of each + math_t normA = raft::matrix::getL2Norm(handle, A_d, m * n, stream); + math_t normU = raft::matrix::getL2Norm(handle, U, m * k, stream); + math_t normS = raft::matrix::getL2Norm(handle, S_mat.data(), k * k, stream); + math_t normV = raft::matrix::getL2Norm(handle, V, n * k, stream); + math_t normP = raft::matrix::getL2Norm(handle, P_d.data(), m * n, stream); + + // calculate percent error + const math_t alpha = 1.0, beta = -1.0; + raft::mr::device::buffer A_minus_P(allocator, stream, m * n); + CUDA_CHECK( + cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, + &alpha, A_d, m, &beta, P_d.data(), m, + A_minus_P.data(), m, stream)); + + math_t norm_A_minus_P = + raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; + return (percent_error / 100.0 < tol); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h new file mode 100644 index 0000000000..d90f6271fa --- /dev/null +++ b/cpp/include/raft/linalg/transpose.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @brief transpose on the column major input matrix using Jacobi method + * @param handle: raft handle + * @param in: input matrix + * @param out: output. 
Transposed input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream: cuda stream + */ +template +void transpose(const raft::handle_t &handle, math_t *in, math_t *out, + int n_rows, int n_cols, cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + int out_n_rows = n_cols; + int out_n_cols = n_rows; + + const math_t alpha = 1.0; + const math_t beta = 0.0; + CUBLAS_CHECK(raft::linalg::cublasgeam( + cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in, + n_rows, &beta, out, out_n_rows, out, out_n_rows, stream)); +} + +/** + * @brief transpose on the column major input matrix using Jacobi method + * @param inout: input and output matrix + * @param n: number of rows and columns of input matrix + * @param stream: cuda stream + */ +template +void transpose(math_t *inout, int n, cudaStream_t stream) { + auto m = n; + auto size = n * n; + auto d_inout = inout; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh new file mode 100644 index 0000000000..46b4d296cb --- /dev/null +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, + Lambda op) { + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); + } + b.store(out, idx); +} + +template +void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + unaryOpKernel + <<>>(out, in, len, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise unary operation in the input array + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in the input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val);` + */ +template +void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + if (len <= 0) return; //silently skip in case of 0 length input + constexpr auto maxSize = + sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && + outAddr % 16 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && + outAddr % 8 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && + outAddr % 4 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && + outAddr % 2 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (1 / maxSize) { + unaryOpImpl( + out, in, len, op, stream); + } else { + unaryOpImpl(out, in, len, op, + stream); + } +} + +template +__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + if (idx < len) { + op(out + idx, idx); + } +} + +/** + * @brief Perform an element-wise unary operation into the output array + * + * Compared to `unaryOp()`, this method does not do any reads from any inputs + * + * @tparam OutType output data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * + * @param[out] out the output array [on device] [len = len] + * @param[in] len number of elements in the input array + * @param[in] op the device-lambda which must be of the form: + * `void func(OutType* outLocationOffset, IdxType idx);` + * where outLocationOffset will be out + idx. + * @param[in] stream cuda stream where to launch work + */ +template +void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, + cudaStream_t stream) { + if (len <= 0) return; // silently skip in case of 0 length input + auto nblks = raft::ceildiv(len, TPB); + writeOnlyUnaryOpKernel + <<>>(out, len, op); + CUDA_CHECK(cudaGetLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh new file mode 100644 index 0000000000..0a72117140 --- /dev/null +++ b/cpp/include/raft/matrix/math.cuh @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace raft {
+namespace matrix {
+
+/**
+ * @defgroup MatrixMathOp math operations on the input matrix
+ * @{
+ */
+
+/**
+ * @brief Power of every element in the input matrix (out = scalar * in^2)
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param scalar: every squared element is multiplied by scalar
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *in, math_t *out, math_t scalar, int len,
+           cudaStream_t stream) {
+  auto d_src = in;
+  auto d_dest = out;
+
+  raft::linalg::binaryOp(
+    d_dest, d_src, d_src, len,
+    [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream);
+}
+
+/**
+ * @brief Power of every element in the input matrix (inout = scalar * inout^2)
+ * @param inout: input matrix in which the result is also stored
+ * @param scalar: every squared element is multiplied by scalar
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) {
+  power(inout, inout, scalar, len, stream);
+}
+
+/**
+ * @brief Square of every element in the input matrix (inout = inout^2)
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *inout, int len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  power(inout, scalar, len, stream);
+}
+
+/**
+ * @brief Square of every element in the input matrix (out = in^2)
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *in, math_t *out, int len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  power(in, out, scalar, len, stream);
+}
+
+/**
+ * @brief Square root of every element in the input matrix
+ * (out = sqrt(scalar * in))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param scalar: every element is multiplied by scalar before the square root
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param set_neg_zero whether to set negative inputs to zero
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len,
+             cudaStream_t stream, bool set_neg_zero = false) {
+  auto d_src = in;
+  auto d_dest = out;
+
+  raft::linalg::unaryOp(
+    d_dest, d_src, len,
+    [=] __device__(math_t a) {
+      if (set_neg_zero) {
+        if (a < math_t(0)) {
+          return math_t(0);
+        } else {
+          return sqrt(a * scalar);
+        }
+      } else {
+        return sqrt(a * scalar);
+      }
+    },
+    stream);
+}
+
+/**
+ * @brief Square root of every element in the input matrix
+ * (inout = sqrt(scalar * inout))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param scalar: every element is multiplied by scalar before the square root
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param set_neg_zero whether to set negative inputs to zero
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
+             bool set_neg_zero = false) {
+  seqRoot(inout, inout, scalar, len, stream, set_neg_zero);
+}
+
+/**
+ * @brief Square root of every element in the input matrix (out = sqrt(in))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  seqRoot(in, out, scalar, len, stream);
+}
+
+/**
+ * @brief Square root of every element in the input matrix (inout = sqrt(inout))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  seqRoot(inout, inout, scalar, len, stream);
+}
+
+/**
+ * @brief sets the small values to zero based on a defined threshold
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param in: input matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param thres: threshold below which |value| is set to zero
+ */
+template <typename math_t, typename IdxType>
+void setSmallValuesZero(math_t *out, const math_t *in, IdxType len,
+                        cudaStream_t stream, math_t thres = 1e-15) {
+  raft::linalg::unaryOp(
+    out, in, len,
+    [=] __device__(math_t a) {
+      if (a <= thres && -a <= thres) {
+        return math_t(0);
+      } else {
+        return a;
+      }
+    },
+    stream);
+}
+
+/**
+ * @brief sets the small values to zero based on a defined threshold
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param thres: threshold below which |value| is set to zero
+ */
+template <typename math_t, typename IdxType>
+void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream,
+                        math_t thres = 1e-15) {
+  setSmallValuesZero(inout, inout, len, stream, thres);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix (out = scalar / in)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param scalar: numerator used for every element
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param setzero when true, the output is set to zero wherever |input| is not
+ * greater than the threshold
+ * @param thres the threshold used to forcibly set outputs to zero
+ */
+template <typename math_t>
+void reciprocal(math_t *in, math_t *out, math_t scalar, int len,
+                cudaStream_t stream, bool setzero = false,
+                math_t thres = 1e-15) {
+  auto d_src = in;
+  auto d_dest = out;
+
+  raft::linalg::unaryOp(
+    d_dest, d_src, len,
+    [=] __device__(math_t a) {
+      if (setzero) {
+        if (abs(a) <= thres) {
+          return math_t(0);
+        } else {
+          return scalar / a;
+        }
+      } else {
+        return scalar / a;
+      }
+    },
+    stream);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix
+ * (inout = scalar / inout)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param scalar: numerator used for every element
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param setzero: (default false) when true, the output is set to zero
+ * wherever |input| is not greater than thres
+ * @param thres: threshold used to forcibly set outputs to zero
+ */
+template <typename math_t, typename IdxType>
+void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
+                bool setzero = false, math_t thres = 1e-15) {
+  reciprocal(inout, inout, scalar, len, stream, setzero, thres);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix (inout = 1 / inout)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  reciprocal(inout, scalar, len, stream);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix (out = 1 / in)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  reciprocal(in, out, scalar, len, stream);
+}
+
+/**
+ * @brief Sets every output element to the given scalar (the input is used
+ * only to carry the element type and length)
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param in: input matrix
+ * @param scalar: the value every output element is set to
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void setValue(math_t *out, const math_t *in, math_t scalar, int len,
+              cudaStream_t stream = 0) {
+  raft::linalg::unaryOp(
+    out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream);
+}
+
+/**
+ * @brief Computes the ratio of every element to the sum of the input vector
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param handle: raft handle
+ * @param src: input matrix
+ * @param dest: output matrix. 
The result is stored in the dest matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void ratio(const raft::handle_t &handle, math_t *src, math_t *dest,
+           IdxType len, cudaStream_t stream) {
+  auto d_src = src;
+  auto d_dest = dest;
+
+  std::shared_ptr<raft::mr::device::allocator> allocator =
+    handle.get_device_allocator();
+
+  raft::mr::device::buffer<math_t> d_sum(allocator, stream, 1);
+  auto *d_sum_ptr = d_sum.data();
+  auto no_op = [] __device__(math_t in) { return in; };
+  raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
+  raft::linalg::unaryOp(
+    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); },
+    stream);
+}
+
+/** @} */
+
+// Computes the argmax(d_in) column-wise in a DxN column-major matrix
+template <typename T, int TPB>
+__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(-1, -raft::myInf<T>());
+
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
+  }
+
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  if (threadIdx.x == 0) {
+    argmax[blockIdx.x] = maxKV.key;
+  }
+}
+
+/**
+ * @brief Argmax: find the row idx with maximum value for each column
+ * @param in: input matrix (column-major)
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param out: output vector of size n_cols
+ * @param stream: cuda stream
+ */
+template <typename math_t>
+void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
+            cudaStream_t stream) {
+  int D = n_rows;
+  int N = n_cols;
+  if (D <= 32) {
+    argmaxKernel<math_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
+  } else if (D <= 64) {
+    argmaxKernel<math_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
+  } else if (D <= 128) {
+    argmaxKernel<math_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
+  } else {
+    argmaxKernel<math_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+// Utility kernel needed for signFlip.
+// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
+// flipping the sign if the |max| value for each column is negative.
+template <typename T, int TPB>
+__global__ void signFlipKernel(T *d_in, int D, int N) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax (with abs()) index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(0, 0);
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx])));
+  }
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  // flip column sign if d_in[maxIndex] < 0
+  __shared__ bool need_sign_flip;
+  if (threadIdx.x == 0) {
+    need_sign_flip = d_in[maxKV.key] < T(0);
+  }
+  __syncthreads();
+
+  if (need_sign_flip) {
+    for (int i = threadIdx.x; i < D; i += TPB) {
+      int idx = rowStart + i;
+      d_in[idx] = -d_in[idx];
+    }
+  }
+}
+
+/**
+ * @brief sign flip for PCA. This is used to stabilize the sign of column
+ * major eigenvectors. Flips the sign if the column has negative |max|.
+ * @param inout: input matrix. 
Result also stored in this parameter + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream cuda stream + */ +template +void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { + int D = n_rows; + int N = n_cols; + auto data = inout; + if (D <= 32) { + signFlipKernel<<>>(data, D, N); + } else if (D <= 64) { + signFlipKernel<<>>(data, D, N); + } else if (D <= 128) { + signFlipKernel<<>>(data, D, N); + } else { + signFlipKernel<<>>(data, D, N); + } + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, stream); +} + +template +void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, + bool bcastAlongRows, cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { + if (b == Type(0)) + return a; + else + return a * b; + }, + stream); +} + +template +void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, stream); +} + +template +void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, + bool bcastAlongRows, cudaStream_t stream, + bool return_zero = false) { + if (return_zero) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return Type(0); + else + return a / b; + }, + stream); + } else { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return a; + else + return a / b; + }, + stream); + } +} + +template +void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, stream); +} + +template +void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, stream); +} + +}; // end namespace matrix +}; // end namespace raft diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh new file mode 100644 index 0000000000..ec7ea984db --- /dev/null +++ b/cpp/include/raft/matrix/matrix.cuh @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +using namespace std; + +/** + * @brief Copy selected rows of the input matrix into contiguous space. + * + * On exit out[i + k*n_rows] = in[indices[i] + k*n_rows], + * where i = 0..n_rows_indices-1, and k = 0..n_cols-1. + * + * @param in input matrix + * @param n_rows number of rows of output matrix + * @param n_cols number of columns of output matrix + * @param out output matrix + * @param indices of the rows to be copied + * @param n_rows_indices number of rows to copy + * @param stream cuda stream + * @param rowMajor whether the matrix has row major layout + */ +template +void copyRows(const m_t *in, int n_rows, int n_cols, m_t *out, + const int *indices, int n_rows_indices, cudaStream_t stream, + bool rowMajor = false) { + if (rowMajor) { + ASSERT(false, "matrix.h: row major is not supported yet!"); + } + + auto size = n_rows_indices * n_cols; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(int idx) { + int row = idx % n_rows_indices; + int col = idx / n_rows_indices; + + out[col * n_rows_indices + row] = + in[col * n_rows + indices[row]]; + }); +} + +/** + * @brief copy matrix operation for column major matrices. + * @param in: input matrix + * @param out: output matrix + * @param n_rows: number of rows of output matrix + * @param n_cols: number of columns of output matrix + * @param stream: cuda stream + */ +template +void copy(const m_t *in, m_t *out, int n_rows, int n_cols, + cudaStream_t stream) { + raft::copy_async(out, in, n_rows * n_cols, stream); +} + +/** + * @brief copy matrix operation for column major matrices. First n_rows and + * n_cols of input matrix "in" is copied to "out" matrix. + * @param in: input matrix + * @param in_n_rows: number of rows of input matrix + * @param out: output matrix + * @param out_n_rows: number of rows of output matrix + * @param out_n_cols: number of columns of output matrix + * @param stream: cuda stream + */ +template +void truncZeroOrigin(m_t *in, int in_n_rows, m_t *out, int out_n_rows, + int out_n_cols, cudaStream_t stream) { + auto m = out_n_rows; + auto k = in_n_rows; + auto size = out_n_rows * out_n_cols; + auto d_q = in; + auto d_q_trunc = out; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(int idx) { + int row = idx % m; + int col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); +} + +/** + * @brief Columns of a column major matrix is reversed (i.e. 
first column and
+ * last column are swapped)
+ * @param inout: input and output matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename m_t>
+void colReverse(m_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+  auto n = n_cols;
+  auto m = n_rows;
+  auto size = n_rows * n_cols;
+  auto d_q = inout;
+  auto d_q_reversed = inout;
+  auto counting = thrust::make_counting_iterator(0);
+
+  thrust::for_each(thrust::cuda::par.on(stream), counting,
+                   counting + (size / 2), [=] __device__(int idx) {
+                     int dest_row = idx % m;
+                     int dest_col = idx / m;
+                     int src_row = dest_row;
+                     int src_col = (n - dest_col) - 1;
+                     m_t temp = (m_t)d_q_reversed[idx];
+                     d_q_reversed[idx] = d_q[src_col * m + src_row];
+                     d_q[src_col * m + src_row] = temp;
+                   });
+}
+
+/**
+ * @brief Rows of a column major matrix are reversed in place (i.e. the first
+ * row and the last row are swapped)
+ * @param inout: input and output matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename m_t>
+void rowReverse(m_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+  auto m = n_rows;
+  auto d_q = inout;
+  auto d_q_reversed = inout;
+  auto counting = thrust::make_counting_iterator(0);
+
+  // enumerate only the top half of the rows in every column so that each
+  // (row, m - 1 - row) pair is swapped exactly once; for an odd number of
+  // rows the middle row stays in place
+  thrust::for_each(thrust::cuda::par.on(stream), counting,
+                   counting + (n_cols * (m / 2)), [=] __device__(int idx) {
+                     int dest_row = idx % (m / 2);
+                     int dest_col = idx / (m / 2);
+                     int src_row = (m - dest_row) - 1;
+                     int src_col = dest_col;
+
+                     m_t temp = (m_t)d_q_reversed[dest_col * m + dest_row];
+                     d_q_reversed[dest_col * m + dest_row] =
+                       d_q[src_col * m + src_row];
+                     d_q[src_col * m + src_row] = temp;
+                   });
+}
+
+/**
+ * @brief Prints the data stored in GPU memory
+ * @param in: input matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param h_separator: horizontal separator character
+ * @param v_separator: vertical separator character
+ */
+template <typename m_t>
+void print(const m_t *in, int n_rows, int n_cols, char h_separator = ' ',
+           char v_separator = '\n') {
+  std::vector<m_t> h_matrix(n_cols * n_rows);
+  CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t),
+                        cudaMemcpyDeviceToHost));
+
+  for (auto i = 0; i < n_rows; i++) {
+    for (auto j = 0; j < n_cols; j++) {
+      printf("%1.4f%c", h_matrix[j * n_rows + i],
+             j < n_cols - 1 ? 
h_separator : v_separator); + } + } +} + +/** + * @brief Prints the data stored in CPU memory + * @param in: input matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + */ +template +void printHost(const m_t *in, int n_rows, int n_cols) { + for (auto i = 0; i < n_rows; i++) { + for (auto j = 0; j < n_cols; j++) { + printf("%1.4f ", in[j * n_rows + i]); + } + printf("\n"); + } +} + +/** + * @brief Kernel for copying a slice of a big matrix to a small matrix with a + * size matches that slice + * @param src_d: input matrix + * @param m: number of rows of input matrix + * @param n: number of columns of input matrix + * @param dst_d: output matrix + * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) + * @param x2, y2: coordinate of the bottom-right point of the wanted area + * (1-based) + */ +template +__global__ void slice(m_t *src_d, int m, int n, m_t *dst_d, int x1, int y1, + int x2, int y2) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int dm = x2 - x1, dn = y2 - y1; + if (idx < dm * dn) { + int i = idx % dm, j = idx / dm; + int is = i + x1, js = j + y1; + dst_d[idx] = src_d[is + js * m]; + } +} + +/** + * @brief Slice a matrix (in-place) + * @param in: input matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param out: output matrix + * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) + * @param x2, y2: coordinate of the bottom-right point of the wanted area + * (1-based) + * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, + * 3, 0, 1, 4, 3); + * @param stream: cuda stream + */ +template +void sliceMatrix(m_t *in, int n_rows, int n_cols, m_t *out, int x1, int y1, + int x2, int y2, cudaStream_t stream) { + // Slicing + dim3 block(64); + dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); + slice<<>>(in, n_rows, n_cols, out, x1, y1, x2, y2); +} + +/** + * @brief Kernel for copying the upper triangular part of a matrix to another + * @param src: input matrix with a size of mxn + * @param dst: output matrix with a size of kxk + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param k: min(n_rows, n_cols) + */ +template +__global__ void getUpperTriangular(m_t *src, m_t *dst, int n_rows, int n_cols, + int k) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int m = n_rows, n = n_cols; + if (idx < m * n) { + int i = idx % m, j = idx / m; + if (i < k && j < k && j >= i) { + dst[i + j * k] = src[idx]; + } + } +} + +/** + * @brief Copy the upper triangular part of a matrix to another + * @param src: input matrix with a size of n_rows x n_cols + * @param dst: output matrix with a size of kxk, k = min(n_rows, n_cols) + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream: cuda stream + */ +template +void copyUpperTriangular(m_t *src, m_t *dst, int n_rows, int n_cols, + cudaStream_t stream) { + int m = n_rows, n = n_cols; + int k = min(m, n); + dim3 block(64); + dim3 grid((m * n + block.x - 1) / block.x); + getUpperTriangular<<>>(src, dst, m, n, k); +} + +/** + * @brief Copy a vector to the diagonal of a matrix + * @param vec: vector of length k = min(n_rows, n_cols) + * @param matrix: matrix of size n_rows x n_cols + * @param m: number of rows of the matrix + * @param n: number of columns of the matrix + * @param k: dimensionality + */ +template 
+__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, int m, int n, + int k) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + + if (idx < k) { + matrix[idx + idx * m] = vec[idx]; + } +} + +/** + * @brief Initialize a diagonal matrix with a vector + * @param vec: vector of length k = min(n_rows, n_cols) + * @param matrix: matrix of size n_rows x n_cols + * @param n_rows: number of rows of the matrix + * @param n_cols: number of columns of the matrix + * @param stream: cuda stream + */ +template +void initializeDiagonalMatrix(m_t *vec, m_t *matrix, int n_rows, int n_cols, + cudaStream_t stream) { + int k = min(n_rows, n_cols); + dim3 block(64); + dim3 grid((k + block.x - 1) / block.x); + copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, + n_cols, k); +} + +/** + * @brief Calculate the inverse of the diagonal of a square matrix + * element-wise and in place + * @param in: square input matrix with size len x len + * @param len: size of one side of the matrix + */ +template +__global__ void matrixDiagonalInverse(m_t *in, int len) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < len) { + in[idx + idx * len] = 1.0 / in[idx + idx * len]; + } +} + +/** + * @brief Get a square matrix with elements on diagonal reversed (in-place) + * @param in: square input matrix with size len x len + * @param len: size of one side of the matrix + * @param stream: cuda stream + */ +template +void getDiagonalInverseMatrix(m_t *in, int len, cudaStream_t stream) { + dim3 block(64); + dim3 grid((len + block.x - 1) / block.x); + matrixDiagonalInverse<<>>(in, len); +} + +/** + * @brief Get the L2/F-norm of a matrix/vector + * @param in: input matrix/vector with totally size elements + * @param size: size of the matrix/vector + * @param cublasH cublas handle + * @param stream: cuda stream + */ +template +m_t getL2Norm(const raft::handle_t &handle, m_t *in, int size, + cudaStream_t stream) { + cublasHandle_t cublasH = handle.get_cublas_handle(); + m_t normval = 0; + CUBLAS_CHECK( + raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + return normval; +} + +}; // end namespace matrix +}; // end namespace raft diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh new file mode 100644 index 0000000000..56710ea81f --- /dev/null +++ b/cpp/include/raft/random/rng.cuh @@ -0,0 +1,676 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "rng_impl.cuh" + +namespace raft { +namespace random { + +/** all different generator types used */ +enum GeneratorType { + /** curand-based philox generator */ + GenPhilox = 0, + /** LFSR taps generator */ + GenTaps, + /** kiss99 generator (currently the fastest) */ + GenKiss99 +}; + +template +__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, + LenType len, Lambda randOp) { + LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; + detail::Generator gen(seed, (uint64_t)tid, offset); + const LenType stride = gridDim.x * blockDim.x; + for (LenType idx = tid; idx < len; idx += stride) { + MathType val; + gen.next(val); + ptr[idx] = randOp(val, idx); + } +} + +// used for Box-Muller type transformations +template +__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, + LenType len, Lambda2 rand2Op) { + LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; + detail::Generator gen(seed, (uint64_t)tid, offset); + const LenType stride = gridDim.x * blockDim.x; + for (LenType idx = tid; idx < len; idx += stride) { + MathType val1, val2; + gen.next(val1); + gen.next(val2); + rand2Op(val1, val2, idx, idx + stride); + if (idx < len) ptr[idx] = (OutType)val1; + idx += stride; + if (idx < len) ptr[idx] = (OutType)val2; + } +} + +template +__global__ void constFillKernel(Type *ptr, int len, Type val) { + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned stride = gridDim.x * blockDim.x; + for (unsigned idx = tid; idx < len; idx += stride) { + ptr[idx] = val; + } +} + +/** + * @brief Helper method to compute Box Muller transform + * + * @tparam Type data type + * + * @param[inout] val1 first value + * @param[inout] val2 second value + * @param[in] sigma1 standard deviation of output gaussian for first value + * @param[in] mu1 mean of output gaussian for first value + * @param[in] sigma2 standard deviation of output gaussian for second value + * @param[in] mu2 mean of output gaussian for second value + * @{ + */ +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, + Type sigma2, Type mu2) { + constexpr Type twoPi = Type(2.0) * Type(3.141592654); + constexpr Type minus2 = -Type(2.0); + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; + Type s, c; + raft::mySinCos(theta, s, c); + val1 = R * c * sigma1 + mu1; + val2 = R * s * sigma2 + mu2; +} +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { + box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); +} +/** @} */ + +/** The main random number generator class, fully on GPUs */ +class Rng { + public: + /** + * @brief ctor + * @param _s 64b seed used to initialize the RNG + * @param _t backend device RNG generator type + * @note Refer to the `Rng::seed` method for details about seeding the engine + */ + Rng(uint64_t _s, GeneratorType _t = GenPhilox) + : type(_t), + offset(0), + // simple heuristic to make sure all SMs will be occupied properly + // and also not too many initialization calls will be made by each thread + nBlocks(4 * getMultiProcessorCount()), + gen() { + seed(_s); + } + + /** + * @brief Seed (and thus re-initialize) the underlying RNG engine + * @param _s 64b seed used to initialize the RNG + * @note If you need non-reproducibility, pass a seed that's, for example, a + * function of timestamp. 
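+ * (For instance, constructing the engine as
+ * `raft::random::Rng rng{(uint64_t)time(nullptr)};` is one illustrative,
+ * non-reproducible option; the exact scheme is the caller's choice.)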
Another example is to use the c++11's + * `std::random_device` for setting seed. + */ + void seed(uint64_t _s) { + gen.seed(_s); + offset = 0; + } + + /** + * @brief Generates the 'a' and 'b' parameters for a modulo affine + * transformation equation: `(ax + b) % n` + * + * @tparam IdxT integer type + * + * @param[in] n the modulo range + * @param[out] a slope parameter + * @param[out] b intercept parameter + */ + template + void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + // always keep 'a' to be coprime to 'n' + a = gen() % n; + while (gcd(a, n) != 1) { + ++a; + if (a >= n) a = 0; + } + // the bias term 'b' can be any number in the range of [0, n) + b = gen() % n; + } + + /** + * @brief Generate uniformly distributed numbers in the given range + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param start start of the range + * @param end end of the range + * @param stream stream where to launch the kernel + * @{ + */ + template + void uniform(Type *ptr, LenType len, Type start, Type end, + cudaStream_t stream) { + static_assert(std::is_floating_point::value, + "Type for 'uniform' can only be floating point!"); + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + return (val * (end - start)) + start; + }, + stream); + } + template + void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, + cudaStream_t stream) { + static_assert(std::is_integral::value, + "Type for 'uniformInt' can only be integer!"); + custom_distribution( + ptr, len, + [=] __device__(IntType val, LenType idx) { + return (val % (end - start)) + start; + }, + stream); + } + /** @} */ + + /** + * @brief Generate normal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + * @{ + */ + template + void normal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + static_assert(std::is_floating_point::value, + "Type for 'normal' can only be floating point!"); + rand2Impl( + offset, ptr, len, + [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { + box_muller_transform(val1, val2, sigma, mu); + }, + NumThreads, nBlocks, type, stream); + } + template + void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, + cudaStream_t stream) { + static_assert(std::is_integral::value, + "Type for 'normalInt' can only be integer!"); + rand2Impl( + offset, ptr, len, + [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) { + box_muller_transform(val1, val2, sigma, mu); + }, + NumThreads, nBlocks, type, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed table according to the given set of + * means and scalar standard deviations. + * + * Each row in this table conforms to a normally distributed n-dim vector + * whose mean is the input vector and standard deviation is the corresponding + * vector or scalar. Correlations among the dimensions itself is assumed to + * be absent. 
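+ *
+ * A minimal usage sketch (the names `table`, `mu_vec`, `stream` and the
+ * 100 x 8 shape are illustrative assumptions, not part of this API):
+ * @code
+ *   raft::random::Rng rng(12345ULL);
+ *   // 100 x 8 table, one mean per column in mu_vec, shared sigma of 1.0
+ *   rng.normalTable(table, 100, 8, mu_vec, nullptr, 1.0f, stream);
+ * @endcode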
+ * + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output table (dim = n_rows x n_cols) + * @param n_rows number of rows in the table + * @param n_cols number of columns in the table + * @param mu mean vector (dim = n_cols x 1). + * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). Pass + * a nullptr to use the same scalar 'sigma' across all components + * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr + * @param stream stream where to launch the kernel + */ + template + void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, + const Type *sigma_vec, Type sigma, cudaStream_t stream) { + rand2Impl( + offset, ptr, n_rows * n_cols, + [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { + // yikes! use fast-int-div + auto col1 = idx1 % n_cols; + auto col2 = idx2 % n_cols; + auto mean1 = mu[col1]; + auto mean2 = mu[col2]; + auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; + auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; + box_muller_transform(val1, val2, sig1, mean1, sig2, mean2); + }, + NumThreads, nBlocks, type, stream); + } + + /** + * @brief Fill an array with the given value + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param val value to be filled + * @param stream stream where to launch the kernel + */ + template + void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + constFillKernel<<>>(ptr, len, val); + CUDA_CHECK(cudaPeekAtLastError()); + } + + /** + * @brief Generate bernoulli distributed boolean array + * + * @tparam Type data type in which to compute the probabilities + * @tparam OutType output data type + * @tparam LenType data type used to represent length of the arrays + * + * @param[out] ptr the output array + * @param[in] len the number of elements in the output + * @param[in] prob coin-toss probability for heads + * @param[in] stream stream where to launch the kernel + */ + template + void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + custom_distribution( + ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, + stream); + } + + /** + * @brief Generate bernoulli distributed array and applies scale + * @tparam Type data type in which to compute the probabilities + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param prob coin-toss probability for heads + * @param scale scaling factor + * @param stream stream where to launch the kernel + */ + template + void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, + cudaStream_t stream) { + static_assert(std::is_floating_point::value, + "Type for 'scaled_bernoulli' can only be floating point!"); + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + return val > prob ? 
-scale : scale; + }, + stream); + } + + /** + * @brief Generate gumbel distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean value + * @param beta scale value + * @param stream stream where to launch the kernel + * @note https://en.wikipedia.org/wiki/Gumbel_distribution + */ + template + void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + return mu - beta * raft::myLog(-raft::myLog(val)); + }, + stream); + } + + /** + * @brief Generate lognormal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + */ + template + void lognormal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + rand2Impl( + offset, ptr, len, + [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { + box_muller_transform(val1, val2, sigma, mu); + val1 = raft::myExp(val1); + val2 = raft::myExp(val2); + }, + NumThreads, nBlocks, type, stream); + } + + /** + * @brief Generate logistic distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean value + * @param scale scale value + * @param stream stream where to launch the kernel + */ + template + void logistic(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + return mu - scale * raft::myLog(one / val - one); + }, + stream); + } + + /** + * @brief Generate exponentially distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param lambda the lambda + * @param stream stream where to launch the kernel + */ + template + void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + return -raft::myLog(one - val) / lambda; + }, + stream); + } + + /** + * @brief Generate rayleigh distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param sigma the sigma + * @param stream stream where to launch the kernel + */ + template + void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; + return raft::mySqrt(-two * raft::myLog(one - val)) * sigma; + }, + stream); + } + + /** + * @brief Generate laplace distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type 
used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu the mean + * @param scale the scale + * @param stream stream where to launch the kernel + */ + template + void laplace(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; + constexpr Type oneHalf = (Type)0.5; + Type out; + if (val <= oneHalf) { + out = mu + scale * raft::myLog(two * val); + } else { + out = mu - scale * raft::myLog(two * (one - val)); + } + return out; + }, + stream); + } + + /** + * @brief Sample the input array without replacement, optionally based on the + * input weight vector for each element in the array + * + * Implementation here is based on the `one-pass sampling` algo described here: + * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf + * + * @note In the sampled array the elements which are picked will always appear + * in the increasing order of their weights as computed using the exponential + * distribution. So, if you're particular about the order (for eg. array + * permutations), then this might not be the right choice! + * + * @tparam DataT data type + * @tparam WeightsT weights type + * @tparam IdxT index type + * @param out output sampled array (of length 'sampledLen') + * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass + * a nullptr if this is not required. + * @param in input array to be sampled (of length 'len') + * @param wts weights array (of length 'len'). Pass a nullptr if uniform + * sampling is desired + * @param sampledLen output sampled array length + * @param len input array length + * @param allocator device allocator for allocating any workspace required + * @param stream cuda stream + */ + template + void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, + IdxT *outIdx, const DataT *in, + const WeightsT *wts, IdxT sampledLen, IdxT len, + cudaStream_t stream) { + ASSERT(sampledLen <= len, + "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); + + std::shared_ptr allocator = + handle.get_device_allocator(); + + raft::mr::device::buffer expWts(allocator, stream, len); + raft::mr::device::buffer sortedWts(allocator, stream, len); + raft::mr::device::buffer inIdx(allocator, stream, len); + raft::mr::device::buffer outIdxBuff(allocator, stream, len); + auto *inIdxPtr = inIdx.data(); + // generate modified weights + custom_distribution( + expWts.data(), len, + [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) { + inIdxPtr[idx] = idx; + constexpr WeightsT one = (WeightsT)1.0; + auto exp = -raft::myLog(one - val); + if (wts != nullptr) { + return exp / wts[idx]; + } + return exp; + }, + stream); + ///@todo: use a more efficient partitioning scheme instead of full sort + // sort the array and pick the top sampledLen items + IdxT *outIdxPtr = outIdxBuff.data(); + raft::mr::device::buffer workspace(allocator, stream); + sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, + (int)len, stream); + if (outIdx != nullptr) { + CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, + cudaMemcpyDeviceToDevice, stream)); + } + scatter(out, in, outIdxPtr, sampledLen, stream); + } + + /** + * @brief Core method to generate a pdf based on the cdf that is defined in + * the input device lambda + * + * @tparam OutType 
output type + * @tparam MathType type on which arithmetic is done + * @tparam LenTyp index type + * @tparam Lambda device lambda (or operator) + * + * @param[out] ptr output buffer [on device] [len = len] + * @param[in] len number of elements to be generated + * @param[in] randOp the device lambda or operator + * @param[in] stream cuda stream + * @{ + */ + template + void custom_distribution(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + randImpl( + offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); + } + template + void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + rand2Impl( + offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); + } + /** @} */ + + private: + /** generator type */ + GeneratorType type; + /** + * offset is also used to initialize curand state. + * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), + * but is still a large period. + */ + uint64_t offset; + /** number of blocks to launch */ + int nBlocks; + /** next seed generator for device-side RNG */ + std::mt19937_64 gen; + + static const int NumThreads = 256; + + template + uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len, + int nThreads, int nBlocks) { + LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads)); + if (IsNormal && itemsPerThread % 2 == 1) { + ++itemsPerThread; + } + // curand uses 2 32b uint's to generate one double + uint64_t factor = sizeof(Type) / sizeof(float); + if (factor == 0) ++factor; + // Check if there are enough random numbers left in sequence + // If not, then generate new seed and start from zero offset + uint64_t newOffset = offset + LenType(itemsPerThread) * factor; + if (newOffset < offset) { + offset = 0; + seed = gen(); + newOffset = itemsPerThread * factor; + } + return newOffset; + } + + template + void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp, + int nThreads, int nBlocks, GeneratorType type, + cudaStream_t stream) { + if (len <= 0) return; + uint64_t seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, + nThreads, nBlocks); + switch (type) { + case GenPhilox: + randKernel + <<>>(seed, offset, ptr, len, randOp); + break; + case GenTaps: + randKernel + <<>>(seed, offset, ptr, len, randOp); + break; + case GenKiss99: + randKernel + <<>>(seed, offset, ptr, len, randOp); + break; + default: + ASSERT(false, "randImpl: Incorrect generator type! %d", type); + }; + CUDA_CHECK(cudaGetLastError()); + offset = newOffset; + } + + template + void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op, + int nThreads, int nBlocks, GeneratorType type, + cudaStream_t stream) { + if (len <= 0) return; + auto seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, + nThreads, nBlocks); + switch (type) { + case GenPhilox: + rand2Kernel + <<>>(seed, offset, ptr, len, rand2Op); + break; + case GenTaps: + rand2Kernel + <<>>(seed, offset, ptr, len, rand2Op); + break; + case GenKiss99: + rand2Kernel + <<>>(seed, offset, ptr, len, rand2Op); + break; + default: + ASSERT(false, "rand2Impl: Incorrect generator type! 
%d", type); + }; + CUDA_CHECK(cudaGetLastError()); + offset = newOffset; + } +}; + +}; // end namespace random +}; // end namespace raft diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh new file mode 100644 index 0000000000..d44c6f018b --- /dev/null +++ b/cpp/include/raft/random/rng_impl.cuh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace random { +namespace detail { + +/** Philox-based random number generator */ +// Courtesy: Jakub Szuppe +struct PhiloxGenerator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed random seed (can be same across all threads) + * @param subsequence as found in curand docs + * @param offset as found in curand docs + */ + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + curand_init(seed, subsequence, offset, &state); + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + DI void next(float& ret) { ret = curand_uniform(&(this->state)); } + DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } + DI void next(uint32_t& ret) { ret = curand(&(this->state)); } + DI void next(uint64_t& ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t& ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t& ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + curandStatePhilox4_32_10_t state; +}; + +/** LFSR taps-filter for generating random numbers. */ +// Courtesy: Vinay Deshpande +struct TapsGenerator { + /** + * @brief ctor. 
Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused + */ + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + uint64_t stride = blockDim.x * gridDim.x; + delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; + stride *= blockDim.y * gridDim.y; + delta += ((blockIdx.z * blockDim.z) + threadIdx.z) * stride; + state = seed + delta + 1; + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type& ret) { + constexpr double ULL_LARGE = 1.8446744073709551614e19; + uint64_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(ULL_LARGE); + } + DI void next(uint64_t& ret) { + constexpr uint64_t TAPS = 0x8000100040002000ULL; + constexpr int ROUNDS = 128; + for (int i = 0; i < ROUNDS; i++) + state = (state >> 1) ^ (-(state & 1ULL) & TAPS); + ret = state; + } + DI void next(uint32_t& ret) { + uint64_t val; + next(val); + ret = (uint32_t)val; + } + DI void next(int32_t& ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t& ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + uint64_t state; +}; + +/** Kiss99-based random number generator */ + +struct Kiss99Generator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused + */ + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + initKiss99(seed); + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type& ret) { + constexpr double U_LARGE = 4.294967295e9; + uint32_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(U_LARGE); + } + DI void next(uint32_t& ret) { + uint32_t MWC; + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); + MWC = ((z << 16) + w); + jsr ^= (jsr << 17); + jsr ^= (jsr >> 13); + jsr ^= (jsr << 5); + jcong = 69069 * jcong + 1234567; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; + } + DI void next(uint64_t& ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t& ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t& ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** one of the kiss99 states */ + uint32_t z; + /** one of the kiss99 states */ + uint32_t w; + /** one of the kiss99 states */ + uint32_t jsr; + /** one of the kiss99 states */ + uint32_t jcong; + + // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower + // 128 bits. It uses 32-bit wide multiply only. 
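+  // The hash is kept as four little-endian 32-bit words h[0]..h[3]; every
+  // 64-bit partial product below lands in one or two of those words, and the
+  // carries between words are propagated manually.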
+ DI void mulByFnv1a128Prime(uint32_t* h) { + typedef union { + uint32_t u32[2]; + uint64_t u64[1]; + } words64; + + // 128-bit FNV prime = p3 * 2^96 + p2 * 2^64 + p1 * 2^32 + p0 + // Here p0 = 315, p2 = 16777216, p1 = p3 = 0 + const uint32_t p0 = uint32_t(315), p2 = uint32_t(16777216); + // Partial products + words64 h0p0, h1p0, h2p0, h0p2, h3p0, h1p2; + + h0p0.u64[0] = uint64_t(h[0]) * p0; + h1p0.u64[0] = uint64_t(h[1]) * p0; + h2p0.u64[0] = uint64_t(h[2]) * p0; + h0p2.u64[0] = uint64_t(h[0]) * p2; + h3p0.u64[0] = uint64_t(h[3]) * p0; + h1p2.u64[0] = uint64_t(h[1]) * p2; + + // h_n[0] = LO(h[0]*p[0]); + // h_n[1] = HI(h[0]*p[0]) + LO(h[1]*p[0]); + // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); + // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); + uint32_t carry = 0; + h[0] = h0p0.u32[0]; + + h[1] = h0p0.u32[1] + h1p0.u32[0]; + carry = h[1] < h0p0.u32[1] ? 1 : 0; + + h[2] = h1p0.u32[1] + carry; + carry = h[2] < h1p0.u32[1] ? 1 : 0; + h[2] += h2p0.u32[0]; + carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; + h[2] += h0p2.u32[0]; + carry = h[2] < h0p2.u32[0] ? carry + 1 : carry; + + h[3] = h2p0.u32[1] + h0p2.u32[1] + h3p0.u32[0] + h1p2.u32[0] + carry; + return; + } + + DI void fnv1a128(uint32_t* hash, uint32_t txt) { + hash[0] ^= (txt >> 0) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 8) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 16) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 24) & 0xFF; + mulByFnv1a128Prime(hash); + } + + DI void initKiss99(uint64_t seed) { + // Initialize hash to 128-bit FNV1a basis + uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; + + // Digest threadIdx, blockIdx and seed + fnv1a128(hash, threadIdx.x); + fnv1a128(hash, threadIdx.y); + fnv1a128(hash, threadIdx.z); + fnv1a128(hash, blockIdx.x); + fnv1a128(hash, blockIdx.y); + fnv1a128(hash, blockIdx.z); + fnv1a128(hash, uint32_t(seed)); + fnv1a128(hash, uint32_t(seed >> 32)); + + // Initialize KISS99 state with hash + z = hash[0]; + w = hash[1]; + jsr = hash[2]; + jcong = hash[3]; + } +}; + +/** + * @brief generator-agnostic way of generating random numbers + * @tparam GenType the generator object that expose 'next' method + */ +template +struct Generator { + DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) + : gen(seed, subsequence, offset) {} + + template + DI void next(Type& ret) { + gen.next(ret); + } + + private: + /** the actual generator */ + GenType gen; +}; + +}; // end namespace detail +}; // end namespace random +}; // end namespace raft diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh new file mode 100644 index 0000000000..8691cabc85 --- /dev/null +++ b/cpp/include/raft/stats/mean.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace stats { + +///@todo: ColsPerBlk has been tested only for 32! 
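+// In the row-major kernel below, each block owns a ColsPerBlk-wide strip of
+// columns: threads first accumulate privately over rows, then combine per
+// column in shared memory via atomics, and finally one atomic add per column
+// folds the block's partial sum into the global output. Illustrative use of
+// the public mean() API defined further down (mu_dev, data_dev, D, N and
+// stream are assumed, caller-provided names):
+//   raft::stats::mean(mu_dev, data_dev, D, N, false, true, stream);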
+template +__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { + const int RowsPerBlkPerIter = TPB / ColsPerBlk; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; + for (IdxType i = rowId; i < N; i += stride) + thread_data += (colId < D) ? data[i * D + colId] : Type(0); + __shared__ Type smu[ColsPerBlk]; + if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); + __syncthreads(); + raft::myAtomicAdd(smu + thisColId, thread_data); + __syncthreads(); + if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]); +} + +template +__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Type thread_data = Type(0); + IdxType colStart = N * blockIdx.x; + for (IdxType i = threadIdx.x; i < N; i += TPB) { + IdxType idx = colStart + i; + thread_data += data[idx]; + } + Type acc = BlockReduce(temp_storage).Sum(thread_data); + if (threadIdx.x == 0) { + mu[blockIdx.x] = acc / N; + } +} + +/** + * @brief Compute mean of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type: the data type + * @tparam IdxType Integer type used to for addressing + * @param mu: the output mean vector + * @param data: the input matrix + * @param D: number of columns of data + * @param N: number of rows of data + * @param sample: whether to evaluate sample mean or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor: whether the input data is row or col major + * @param stream: cuda stream + */ +template +void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, + bool rowMajor, cudaStream_t stream) { + static const int TPB = 256; + if (rowMajor) { + static const int RowsPerThread = 4; + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), + raft::ceildiv(D, (IdxType)ColsPerBlk)); + CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); + meanKernelRowMajor + <<>>(mu, data, D, N); + CUDA_CHECK(cudaPeekAtLastError()); + Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); + raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); + } else { + meanKernelColMajor + <<>>(mu, data, D, N); + } + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh new file mode 100644 index 0000000000..04934d4388 --- /dev/null +++ b/cpp/include/raft/stats/mean_center.cuh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace stats { + +/** + * @brief Center the input matrix wrt its mean + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output mean-centered matrix + * @param data input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether to broadcast vector along rows or columns + * @param stream cuda stream where to launch work + */ +template +void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + out, data, mu, D, N, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, stream); +} + +/** + * @brief Add the input matrix wrt its mean + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output mean-added matrix + * @param data input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether to broadcast vector along rows or columns + * @param stream cuda stream where to launch work + */ +template +void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, cudaStream_t stream) { + raft::linalg::matrixVectorOp( + out, data, mu, D, N, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, stream); +} + +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh new file mode 100644 index 0000000000..f12c633829 --- /dev/null +++ b/cpp/include/raft/stats/stddev.cuh @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace stats { + +///@todo: ColPerBlk has been tested only for 32! +template +__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, + IdxType N) { + const int RowsPerBlkPerIter = TPB / ColsPerBlk; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; + for (IdxType i = rowId; i < N; i += stride) { + Type val = (colId < D) ? 
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
new file mode 100644
index 0000000000..f12c633829
--- /dev/null
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/binary_op.cuh>
+#include <raft/linalg/eltwise.cuh>
+
+namespace raft {
+namespace stats {
+
+///@todo: ColsPerBlk has been tested only for 32!
+template <typename Type, typename IdxType, int TPB, int ColsPerBlk>
+__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D,
+                                     IdxType N) {
+  const int RowsPerBlkPerIter = TPB / ColsPerBlk;
+  IdxType thisColId = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId = threadIdx.x / ColsPerBlk;
+  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data = Type(0);
+  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  for (IdxType i = rowId; i < N; i += stride) {
+    Type val = (colId < D) ? data[i * D + colId] : Type(0);
+    thread_data += val * val;
+  }
+  __shared__ Type sstd[ColsPerBlk];
+  if (threadIdx.x < ColsPerBlk) sstd[threadIdx.x] = Type(0);
+  __syncthreads();
+  raft::myAtomicAdd(sstd + thisColId, thread_data);
+  __syncthreads();
+  if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(std + colId, sstd[thisColId]);
+}
+
+template <typename Type, typename IdxType, int TPB>
+__global__ void stddevKernelColMajor(Type *std, const Type *data,
+                                     const Type *mu, IdxType D, IdxType N) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type thread_data = Type(0);
+  IdxType colStart = N * blockIdx.x;
+  Type m = mu[blockIdx.x];
+  for (IdxType i = threadIdx.x; i < N; i += TPB) {
+    IdxType idx = colStart + i;
+    Type diff = data[idx] - m;
+    thread_data += diff * diff;
+  }
+  Type acc = BlockReduce(temp_storage).Sum(thread_data);
+  if (threadIdx.x == 0) {
+    std[blockIdx.x] = raft::mySqrt(acc / N);
+  }
+}
+
+template <typename Type, typename IdxType, int TPB>
+__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu,
+                                   IdxType D, IdxType N) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type thread_data = Type(0);
+  IdxType colStart = N * blockIdx.x;
+  Type m = mu[blockIdx.x];
+  for (IdxType i = threadIdx.x; i < N; i += TPB) {
+    IdxType idx = colStart + i;
+    Type diff = data[idx] - m;
+    thread_data += diff * diff;
+  }
+  Type acc = BlockReduce(temp_storage).Sum(thread_data);
+  if (threadIdx.x == 0) {
+    var[blockIdx.x] = acc / N;
+  }
+}
+
+/**
+ * @brief Compute stddev of the input matrix
+ *
+ * Stddev operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param std the output stddev vector
+ * @param data the input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate the sample stddev, i.e. whether to
+ * normalize the output using N-1 (true) or N (false)
+ * @param rowMajor whether the input data is row or col major
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int>
+void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
+            bool sample, bool rowMajor, cudaStream_t stream) {
+  static const int TPB = 256;
+  if (rowMajor) {
+    static const int RowsPerThread = 4;
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    CUDA_CHECK(cudaMemsetAsync(std, 0, sizeof(Type) * D, stream));
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(std, data, D, N);
+    Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
+    raft::linalg::binaryOp(
+      std, std, mu, D,
+      [ratio] __device__(Type a, Type b) {
+        return raft::mySqrt(a * ratio - b * b);
+      },
+      stream);
+  } else {
+    stddevKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(std, data, mu, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
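For reference, the row-major path relies on the identity Var(x) = E[x^2] - (E[x])^2: stddevKernelRowMajor accumulates per-column sums of squares, and the binaryOp epilogue computes sqrt(sum_sq * ratio - mu * mu). With ratio = 1/N this is exactly the population stddev; with ratio = 1/(N-1) the raw second moment is rescaled instead, which is how this implementation applies the sample correction.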
+
+/**
+ * @brief Compute variance of the input matrix
+ *
+ * Variance operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param var the output variance vector
+ * @param data the input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate the sample variance, i.e. whether to
+ * normalize the output using N-1 (true) or N (false)
+ * @param rowMajor whether the input data is row or col major
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int>
+void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N,
+          bool sample, bool rowMajor, cudaStream_t stream) {
+  static const int TPB = 256;
+  if (rowMajor) {
+    static const int RowsPerThread = 4;
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    CUDA_CHECK(cudaMemsetAsync(var, 0, sizeof(Type) * D, stream));
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(var, data, D, N);
+    Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
+    raft::linalg::binaryOp(
+      var, var, mu, D,
+      [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
+  } else {
+    varsKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(var, data, mu, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // namespace stats
+};  // namespace raft
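A hypothetical end-to-end sketch (illustrative buffer names; the mean is computed first because both routines take mu as an input):

@code{.cu}
raft::stats::mean(d_mu, d_data, D, N, false /* population */, true, stream);
raft::stats::stddev(d_std, d_data, d_mu, D, N, false, true /* rowMajor */,
                    stream);
raft::stats::vars(d_var, d_data, d_mu, D, N, false, true, stream);
@endcode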
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
new file mode 100644
index 0000000000..5f8416c7e2
--- /dev/null
+++ b/cpp/include/raft/stats/sum.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/eltwise.cuh>
+#include <raft/vectorized.cuh>
+
+namespace raft {
+namespace stats {
+
+///@todo: ColsPerBlk has been tested only for 32!
+template <typename Type, typename IdxType, int TPB, int ColsPerBlk>
+__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D,
+                                  IdxType N) {
+  const int RowsPerBlkPerIter = TPB / ColsPerBlk;
+  IdxType thisColId = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId = threadIdx.x / ColsPerBlk;
+  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data = Type(0);
+  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  for (IdxType i = rowId; i < N; i += stride)
+    thread_data += (colId < D) ? data[i * D + colId] : Type(0);
+  __shared__ Type smu[ColsPerBlk];
+  if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0);
+  __syncthreads();
+  raft::myAtomicAdd(smu + thisColId, thread_data);
+  __syncthreads();
+  if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]);
+}
+
+template <typename Type, typename IdxType, int TPB>
+__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
+                                  IdxType N) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type thread_data = Type(0);
+  IdxType colStart = N * blockIdx.x;
+  for (IdxType i = threadIdx.x; i < N; i += TPB) {
+    IdxType idx = colStart + i;
+    thread_data += data[idx];
+  }
+  Type acc = BlockReduce(temp_storage).Sum(thread_data);
+  if (threadIdx.x == 0) {
+    mu[blockIdx.x] = acc;
+  }
+}
+
+/**
+ * @brief Compute sum of the input matrix
+ *
+ * Sum operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param output the output sum vector
+ * @param input the input matrix
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param rowMajor whether the input data is row or col major
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int>
+void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor,
+         cudaStream_t stream) {
+  static const int TPB = 256;
+  if (rowMajor) {
+    static const int RowsPerThread = 4;
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    CUDA_CHECK(cudaMemsetAsync(output, 0, sizeof(Type) * D, stream));
+    sumKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(output, input, D, N);
+  } else {
+    sumKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(output, input, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // end namespace stats
+};  // end namespace raft
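And the corresponding sketch for the plain column-wise sum (again with illustrative names):

@code{.cu}
raft::stats::sum(d_colsums, d_data, D, N, true /* rowMajor */, stream);
// Afterwards d_colsums[j] equals the sum over rows i of d_data[i * D + j].
@endcode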
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
new file mode 100644
index 0000000000..1829fc0351
--- /dev/null
+++ b/cpp/include/raft/vectorized.cuh
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_fp16.h>
+#include "cuda_utils.cuh"
+
+namespace raft {
+
+template <typename math_, int VecLen>
+struct IOType {};
+template <>
+struct IOType<bool, 1> {
+  static_assert(sizeof(bool) == sizeof(int8_t),
+                "IOType bool size assumption failed");
+  typedef int8_t Type;
+};
+template <>
+struct IOType<bool, 2> {
+  typedef int16_t Type;
+};
+template <>
+struct IOType<bool, 4> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<bool, 8> {
+  typedef int2 Type;
+};
+template <>
+struct IOType<bool, 16> {
+  typedef int4 Type;
+};
+template <>
+struct IOType<int8_t, 1> {
+  typedef int8_t Type;
+};
+template <>
+struct IOType<int8_t, 2> {
+  typedef int16_t Type;
+};
+template <>
+struct IOType<int8_t, 4> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<int8_t, 8> {
+  typedef int2 Type;
+};
+template <>
+struct IOType<int8_t, 16> {
+  typedef int4 Type;
+};
+template <>
+struct IOType<uint8_t, 1> {
+  typedef uint8_t Type;
+};
+template <>
+struct IOType<uint8_t, 2> {
+  typedef uint16_t Type;
+};
+template <>
+struct IOType<uint8_t, 4> {
+  typedef uint32_t Type;
+};
+template <>
+struct IOType<uint8_t, 8> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<uint8_t, 16> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<int16_t, 1> {
+  typedef int16_t Type;
+};
+template <>
+struct IOType<int16_t, 2> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<int16_t, 4> {
+  typedef int2 Type;
+};
+template <>
+struct IOType<int16_t, 8> {
+  typedef int4 Type;
+};
+template <>
+struct IOType<uint16_t, 1> {
+  typedef uint16_t Type;
+};
+template <>
+struct IOType<uint16_t, 2> {
+  typedef uint32_t Type;
+};
+template <>
+struct IOType<uint16_t, 4> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<uint16_t, 8> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<__half, 1> {
+  typedef __half Type;
+};
+template <>
+struct IOType<__half, 2> {
+  typedef __half2 Type;
+};
+template <>
+struct IOType<__half, 4> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<__half, 8> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<__half2, 1> {
+  typedef __half2 Type;
+};
+template <>
+struct IOType<__half2, 2> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<__half2, 4> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<int32_t, 1> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<int32_t, 2> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<int32_t, 4> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<uint32_t, 1> {
+  typedef uint32_t Type;
+};
+template <>
+struct IOType<uint32_t, 2> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<uint32_t, 4> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<float, 1> {
+  typedef float Type;
+};
+template <>
+struct IOType<float, 2> {
+  typedef float2 Type;
+};
+template <>
+struct IOType<float, 4> {
+  typedef float4 Type;
+};
+template <>
+struct IOType<int64_t, 1> {
+  typedef int64_t Type;
+};
+template <>
+struct IOType<int64_t, 2> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<uint64_t, 1> {
+  typedef uint64_t Type;
+};
+template <>
+struct IOType<uint64_t, 2> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<unsigned long long, 1> {
+  typedef unsigned long long Type;
+};
+template <>
+struct IOType<unsigned long long, 2> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<double, 1> {
+  typedef double Type;
+};
+template <>
+struct IOType<double, 2> {
+  typedef double2 Type;
+};
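A quick illustration of what the specializations above encode; the assertions are hypothetical, assuming <type_traits> is included and the header is compiled as CUDA code:

@code{.cu}
// Four floats move as a single 128-bit float4; two doubles as one double2.
static_assert(std::is_same<raft::IOType<float, 4>::Type, float4>::value,
              "float x4 should vectorize to float4");
static_assert(std::is_same<raft::IOType<double, 2>::Type, double2>::value,
              "double x2 should vectorize to double2");
@endcode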
+
+/**
+ * @struct TxN_t
+ *
+ * @brief Internal data structure that is used to define a facade for
+ * vectorized loads/stores across the most common POD types. The goal of this
+ * file is to provide CUDA programmers with an easy way to have the compiler
+ * issue vectorized load or store instructions to memory (either global or
+ * shared). Vectorized accesses are important as they utilize the memory
+ * resources more efficiently than their non-vectorized counterparts. If, for
+ * whatever reason, one is unable to issue such vectorized operations, one can
+ * always fall back to using plain POD types.
+ *
+ * Example demonstrating the use of load operations, performing math on such
+ * loaded data and finally storing it back.
+ * @code{.cu}
+ * TxN_t<uint8_t, 4> mydata1, mydata2;
+ * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
+ * mydata1.load(ptr1, idx);
+ * mydata2.load(ptr2, idx);
+ * #pragma unroll
+ * for (int i = 0; i < mydata1.Ratio; ++i) {
+ *   mydata1.val.data[i] += mydata2.val.data[i];
+ * }
+ * mydata1.store(ptr1, idx);
+ * @endcode
+ *
+ * The code above remains almost identical if one moves to a different
+ * TxN_t<type, veclen> instantiation. The only change required is to update
+ * the variable declarations appropriately.
+ *
+ * Obviously, it is the caller's responsibility to take care of pointer
+ * alignment!
+ *
+ * @tparam math_ the data-type in which the compute/math needs to happen
+ * @tparam veclen_ the number of 'math_' types to be loaded/stored per
+ * instruction
+ */
+template <typename math_, int veclen_>
+struct TxN_t {
+  /** underlying math data type */
+  typedef math_ math_t;
+  /** internal storage data type */
+  typedef typename IOType<math_t, veclen_>::Type io_t;
+
+  /** defines the number of 'math_t' types stored by this struct */
+  static const int Ratio = veclen_;
+
+  union {
+    /** the vectorized data that is used for subsequent operations */
+    math_t data[Ratio];
+    /** internal data used to ensure vectorized loads/stores */
+    io_t internal;
+  } val;
+
+  ///@todo: add default constructor
+
+  /**
+   * @brief Fill the contents of this structure with a constant value
+   * @param _val the constant to be filled
+   */
+  DI void fill(math_t _val) {
+#pragma unroll
+    for (int i = 0; i < Ratio; ++i) {
+      val.data[i] = _val;
+    }
+  }
+
+  ///@todo: how to handle out-of-bounds!!?
+
+  /**
+   * @defgroup LoadsStores Global/Shared vectored loads or stores
+   *
+   * @brief Perform vectored loads/stores on this structure
+   * @tparam idx_t index data type
+   * @param ptr base pointer from where to load (or store) the data. It must
+   * be aligned to 'sizeof(io_t)'!
+   * @param idx the offset from the base pointer which will be loaded
+   * (or stored) by the current thread. This must be aligned to 'Ratio'!
+   *
+   * @note: In case of loads, after a successful execution, the val.data will
+   * be populated with the desired data loaded from the pointer location. In
+   * case of stores, the data in the val.data will be stored to that location.
+   * @{
+   */
+  template <typename idx_t>
+  DI void load(const math_t *ptr, idx_t idx) {
+    const io_t *bptr = reinterpret_cast<const io_t *>(&ptr[idx]);
+    val.internal = __ldg(bptr);
+  }
+
+  template <typename idx_t>
+  DI void load(math_t *ptr, idx_t idx) {
+    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
+    val.internal = *bptr;
+  }
+
+  template <typename idx_t>
+  DI void store(math_t *ptr, idx_t idx) {
+    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
+    *bptr = val.internal;
+  }
+  /** @} */
+};
+
+/** this is just to keep the compiler happy! */
+template <typename math_>
+struct TxN_t<math_, 0> {
+  typedef math_ math_t;
+  static const int Ratio = 1;
+
+  union {
+    math_t data[1];
+  } val;
+
+  DI void fill(math_t _val) {}
+  template <typename idx_t>
+  DI void load(const math_t *ptr, idx_t idx) {}
+  template <typename idx_t>
+  DI void load(math_t *ptr, idx_t idx) {}
+  template <typename idx_t>
+  DI void store(math_t *ptr, idx_t idx) {}
+};
+
+}  // namespace raft
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
new file mode 100644
index 0000000000..2fc9d4e30f
--- /dev/null
+++ b/cpp/test/linalg/add.cu
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/linalg/add.cuh>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+#include "add.cuh"
+
+namespace raft {
+namespace linalg {
+
+template <typename InT, typename OutT = InT>
+class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<AddInputs<InT, OutT>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int len = params.len;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(in1, len);
+    raft::allocate(in2, len);
+    raft::allocate(out_ref, len);
+    raft::allocate(out, len);
+    r.uniform(in1, len, InT(-1.0), InT(1.0), stream);
+    r.uniform(in2, len, InT(-1.0), InT(1.0), stream);
+    naiveAddElem(out_ref, in1, in2, len);
+    add(out, in1, in2, len, stream);
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(in1));
+    CUDA_CHECK(cudaFree(in2));
+    CUDA_CHECK(cudaFree(out_ref));
+    CUDA_CHECK(cudaFree(out));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void compare() {
+    ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
+                                  raft::CompareApprox<OutT>(params.tolerance)));
+  }
+
+ protected:
+  AddInputs<InT, OutT> params;
+  InT *in1, *in2;
+  OutT *out_ref, *out;
+  cudaStream_t stream;
+};
+
+const std::vector<AddInputs<float>> inputsf = {
+  {0.000001f, 1024 * 1024, 1234ULL},
+  {0.000001f, 1024 * 1024 + 2, 1234ULL},
+  {0.000001f, 1024 * 1024 + 1, 1234ULL},
+};
+typedef AddTest<float> AddTestF;
+TEST_P(AddTestF, Result) { compare(); }
+INSTANTIATE_TEST_SUITE_P(AddTests, AddTestF, ::testing::ValuesIn(inputsf));
+
+const std::vector<AddInputs<double>> inputsd = {
+  {0.00000001, 1024 * 1024, 1234ULL},
+  {0.00000001, 1024 * 1024 + 2, 1234ULL},
+  {0.00000001, 1024 * 1024 + 1, 1234ULL},
+};
+typedef AddTest<double> AddTestD;
+TEST_P(AddTestD, Result) { compare(); }
+INSTANTIATE_TEST_SUITE_P(AddTests, AddTestD, ::testing::ValuesIn(inputsd));
+
+const std::vector<AddInputs<float, double>> inputsfd = {
+  {0.00000001, 1024 * 1024, 1234ULL},
+  {0.00000001, 1024 * 1024 + 2, 1234ULL},
+  {0.00000001, 1024 * 1024 + 1, 1234ULL},
+};
+typedef AddTest<float, double> AddTestFD;
+TEST_P(AddTestFD, Result) { compare(); }
+INSTANTIATE_TEST_SUITE_P(AddTests, AddTestFD, ::testing::ValuesIn(inputsfd));
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh
new file mode 100644
index 0000000000..137419758f
--- /dev/null
+++ b/cpp/test/linalg/add.cuh
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = OutT(in1[idx] + in2[idx]); + } +} + +template +void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveAddElemKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct AddInputs { + OutT tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const AddInputs &dims) { + return os; +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu new file mode 100644 index 0000000000..357ade7388 --- /dev/null +++ b/cpp/test/linalg/binary_op.cu @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "binary_op.cuh" + +namespace raft { +namespace linalg { + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, + stream); +} + +template +class BinaryOpTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam< + BinaryOpInputs>::GetParam(); + raft::random::Rng r(params.seed); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + IdxType len = params.len; + allocate(in1, len); + allocate(in2, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in1, len, InType(-1.0), InType(1.0), stream); + r.uniform(in2, len, InType(-1.0), InType(1.0), stream); + naiveAdd(out_ref, in1, in2, len); + binaryOpLaunch(out, in1, in2, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in1)); + CUDA_CHECK(cudaFree(in2)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + BinaryOpInputs params; + InType *in1, *in2; + OutType *out_ref, *out; +}; + +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i32; +TEST_P(BinaryOpTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.000001f, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i64; +TEST_P(BinaryOpTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, 
params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i32_D; +TEST_P(BinaryOpTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); + +const std::vector> inputsd_i32 = { + {0.00000001, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestD_i32; +TEST_P(BinaryOpTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.00000001, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestD_i64; +TEST_P(BinaryOpTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // namespace linalg +} // namespace raft diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh new file mode 100644 index 0000000000..fd8ed6dd1e --- /dev/null +++ b/cpp/test/linalg/binary_op.cuh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveAddKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len) { + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); + if (idx < len) { + out[idx] = static_cast(in1[idx] + in2[idx]); + } +} + +template +void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { + static const IdxType TPB = 64; + IdxType nblks = raft::ceildiv(len, TPB); + naiveAddKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct BinaryOpInputs { + InType tolerance; + IdxType len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const BinaryOpInputs &d) { + return os; +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu new file mode 100644 index 0000000000..e45f5651b4 --- /dev/null +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +template +struct coalescedReductionInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const coalescedReductionInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows, + cudaStream_t stream, bool inplace = false) { + coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace, + [] __device__(T in, int i) { return in * in; }); +} + +template +class coalescedReductionTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + int len = rows * cols; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + raft::allocate(data, len); + raft::allocate(dots_exp, rows); + raft::allocate(dots_act, rows); + r.uniform(data, len, T(-1.0), T(1.0), stream); + naiveCoalescedReduction(dots_exp, data, cols, rows, stream); + + // Perform reduction with default inplace = false first + coalescedReductionLaunch(dots_act, data, cols, rows, stream); + // Add to result with inplace = true next + coalescedReductionLaunch(dots_act, data, cols, rows, stream, true); + + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(dots_exp)); + CUDA_CHECK(cudaFree(dots_act)); + } + + protected: + coalescedReductionInputs params; + T *data, *dots_exp, *dots_act; +}; + +const std::vector> inputsf = { + {0.000002f, 1024, 32, 1234ULL}, + {0.000002f, 1024, 64, 1234ULL}, + {0.000002f, 1024, 128, 1234ULL}, + {0.000002f, 1024, 256, 1234ULL}}; + +const std::vector> inputsd = { + {0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; + +typedef coalescedReductionTest coalescedReductionTestF; +TEST_P(coalescedReductionTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); +} + +typedef coalescedReductionTest coalescedReductionTestD; +TEST_P(coalescedReductionTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu new file mode 100644 index 0000000000..2396558939 --- /dev/null +++ b/cpp/test/linalg/divide.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +template +__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in[idx] / scalar; + } +} + +template +void naiveDivide(Type *out, const Type *in, Type scalar, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveDivideKernel<<>>(out, in, scalar, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +class DivideTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + raft::allocate(in, len); + raft::allocate(out_ref, len); + raft::allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + naiveDivide(out_ref, in, params.scalar, len, stream); + divideScalar(out, in, params.scalar, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + UnaryOpInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef DivideTest DivideTestF; +TEST_P(DivideTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, + ::testing::ValuesIn(inputsf)); + +typedef DivideTest DivideTestD; +const std::vector> inputsd = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(DivideTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu new file mode 100644 index 0000000000..159d288174 --- /dev/null +++ b/cpp/test/linalg/eig.cu @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct EigInputs { + T tolerance; + int len; + int n_row; + int n_col; + unsigned long long int seed; + int n; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const EigInputs &dims) { + return os; +} + +template +class EigTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + raft::handle_t handle; + stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + + raft::allocate(cov_matrix, len); + T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, + 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + ASSERT(len == 16, "This test only works with 4x4 matrices!"); + raft::update_device(cov_matrix, cov_matrix_h, len, stream); + + raft::allocate(eig_vectors, len); + raft::allocate(eig_vals, params.n_col); + raft::allocate(eig_vectors_jacobi, len); + raft::allocate(eig_vals_jacobi, params.n_col); + + T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, + 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, + 0.4874, 0.5123, 0.5123, 0.4874}; + T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; + + raft::allocate(eig_vectors_ref, len); + raft::allocate(eig_vals_ref, params.n_col); + + raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); + raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); + + eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, + stream); + + T tol = 1.e-7; + int sweeps = 15; + eigJacobi(handle, cov_matrix, params.n_row, params.n_col, + eig_vectors_jacobi, eig_vals_jacobi, stream, tol, sweeps); + + // test code for comparing two methods + len = params.n * params.n; + raft::allocate(cov_matrix_large, len); + raft::allocate(eig_vectors_large, len); + raft::allocate(eig_vectors_jacobi_large, len); + raft::allocate(eig_vals_large, params.n); + raft::allocate(eig_vals_jacobi_large, params.n); + + r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); + + eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, + eig_vals_large, stream); + eigJacobi(handle, cov_matrix_large, params.n, params.n, + eig_vectors_jacobi_large, eig_vals_jacobi_large, stream, tol, + sweeps); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(cov_matrix)); + CUDA_CHECK(cudaFree(eig_vectors)); + CUDA_CHECK(cudaFree(eig_vectors_jacobi)); + CUDA_CHECK(cudaFree(eig_vals)); + CUDA_CHECK(cudaFree(eig_vals_jacobi)); + CUDA_CHECK(cudaFree(eig_vectors_ref)); + CUDA_CHECK(cudaFree(eig_vals_ref)); + } + + protected: + EigInputs params; + T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, + *eig_vals_jacobi, *eig_vals_ref; + + T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, + *eig_vals_large, *eig_vals_jacobi_large; + + cudaStream_t stream; +}; + +const std::vector> inputsf2 = { + {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; + +const std::vector> inputsd2 = { + {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; + +typedef EigTest EigTestValF; +TEST_P(EigTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestValD; +TEST_P(EigTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecF; 
+TEST_P(EigTestVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecD; +TEST_P(EigTestVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestValJacobiF; +TEST_P(EigTestValJacobiF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestValJacobiD; +TEST_P(EigTestValJacobiD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecJacobiF; +TEST_P(EigTestVecJacobiF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecJacobiD; +TEST_P(EigTestVecJacobiD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecCompareF; +TEST_P(EigTestVecCompareF, Result) { + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecCompareD; +TEST_P(EigTestVecCompareD, Result) { + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValD, ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, + ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, + ::testing::ValuesIn(inputsd2)); + +} // namespace linalg +} // namespace raft diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu new file mode 100644 index 0000000000..b3980f281d --- /dev/null +++ b/cpp/test/linalg/eig_sel.cu @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if CUDART_VERSION >= 10010 + +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct EigSelInputs { + T tolerance; + int len; + int n_row; + int n_col; + unsigned long long int seed; + int n; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const EigSelInputs &dims) { + return os; +} + +template +class EigSelTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + raft::handle_t handle; + stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + int len = params.len; + + raft::allocate(cov_matrix, len); + T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, + 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + ASSERT(len == 16, "This test only works with 4x4 matrices!"); + raft::update_device(cov_matrix, cov_matrix_h, len, stream); + + raft::allocate(eig_vectors, 12); + raft::allocate(eig_vals, params.n_col); + + T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, + -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; + T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; + + raft::allocate(eig_vectors_ref, 12); + raft::allocate(eig_vals_ref, params.n_col); + + raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); + raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); + + eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, + eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(cov_matrix)); + CUDA_CHECK(cudaFree(eig_vectors)); + CUDA_CHECK(cudaFree(eig_vals)); + CUDA_CHECK(cudaFree(eig_vectors_ref)); + CUDA_CHECK(cudaFree(eig_vals_ref)); + } + + protected: + EigSelInputs params; + T *cov_matrix, *eig_vectors, *eig_vectors_ref, *eig_vals, *eig_vals_ref; + + cudaStream_t stream; +}; + +const std::vector> inputsf2 = { + {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; + +const std::vector> inputsd2 = { + {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; + +typedef EigSelTest EigSelTestValF; +TEST_P(EigSelTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigSelTest EigSelTestValD; +TEST_P(EigSelTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigSelTest EigSelTestVecF; +TEST_P(EigSelTestVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigSelTest EigSelTestVecD; +TEST_P(EigSelTestVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, + raft::CompareApproxAbs(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, + ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, + ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft + +#endif diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu new file mode 100644 index 0000000000..572951c557 --- /dev/null +++ b/cpp/test/linalg/eltwise.cu @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +//// Testing unary ops + +template +__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = scalar * in[idx]; + } +} + +template +void naiveScale(Type *out, const Type *in, Type scalar, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel<<>>(out, in, scalar, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct ScalarMultiplyInputs { + T tolerance; + int len; + T scalar; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const ScalarMultiplyInputs &dims) { + return os; +} + +template +class ScalarMultiplyTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + T scalar = params.scalar; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + naiveScale(out_ref, in, scalar, len, stream); + scalarMultiply(out, in, scalar, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + ScalarMultiplyInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf1 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; + +const std::vector> inputsd1 = { + {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; + +typedef ScalarMultiplyTest ScalarMultiplyTestF; +TEST_P(ScalarMultiplyTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +typedef ScalarMultiplyTest ScalarMultiplyTestD; +TEST_P(ScalarMultiplyTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, + ::testing::ValuesIn(inputsf1)); + +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, + ::testing::ValuesIn(inputsd1)); + +//// Testing binary ops + +template +__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in1[idx] + in2[idx]; + } +} + +template +void naiveAdd(Type *out, const Type *in1, const Type *in2, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveAddKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct EltwiseAddInputs { + T tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream 
&operator<<(::std::ostream &os, + const EltwiseAddInputs &dims) { + return os; +} + +template +class EltwiseAddTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + int len = params.len; + allocate(in1, len); + allocate(in2, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in1, len, T(-1.0), T(1.0), stream); + r.uniform(in2, len, T(-1.0), T(1.0), stream); + naiveAdd(out_ref, in1, in2, len, stream); + eltwiseAdd(out, in1, in2, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in1)); + CUDA_CHECK(cudaFree(in2)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + EltwiseAddInputs params; + T *in1, *in2, *out_ref, *out; +}; + +const std::vector> inputsf2 = { + {0.000001f, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd2 = { + {0.00000001, 1024 * 1024, 1234ULL}}; + +typedef EltwiseAddTest EltwiseAddTestF; +TEST_P(EltwiseAddTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +typedef EltwiseAddTest EltwiseAddTestD; +TEST_P(EltwiseAddTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, + ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu new file mode 100644 index 0000000000..cecfc5eb8e --- /dev/null +++ b/cpp/test/linalg/gemm_layout.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct GemmLayoutInputs { + int M; + int N; + int K; + bool zLayout; + bool xLayout; + bool yLayout; + unsigned long long int seed; +}; + +// Reference GEMM implementation. +template +__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, + bool isZColMajor, bool isXColMajor, + bool isYColMajor) { + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int tidy = blockIdx.y * blockDim.y + threadIdx.y; + + for (int m = tidy; m < M; m += (blockDim.y * gridDim.y)) { + for (int n = tidx; n < N; n += (blockDim.x * gridDim.x)) { + T temp = T(0.0); + for (int k = 0; k < K; k++) { + int xIndex = isXColMajor ? m + k * M : m * K + k; + int yIndex = isYColMajor ? k + n * K : k * N + n; + temp += X[xIndex] * Y[yIndex]; + } + int zIndex = isZColMajor ? 
m + n * M : m * N + n; + Z[zIndex] = temp; + } + } +} + +template +class GemmLayoutTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + + raft::handle_t handle; + cudaStream_t stream = handle.get_stream(); + + raft::random::Rng r(params.seed); + + // We compute Z = X * Y and compare against reference result + // Dimensions of X : M x K + // Dimensions of Y : K x N + // Dimensions of Z : M x N + + T *X = NULL; // Argument X + T *Y = NULL; // Argument Y + + size_t xElems = params.M * params.K; + size_t yElems = params.K * params.N; + size_t zElems = params.M * params.N; + + CUDA_CHECK(cudaMalloc(&X, xElems * sizeof(T))); + CUDA_CHECK(cudaMalloc(&Y, yElems * sizeof(T))); + CUDA_CHECK(cudaMalloc(&refZ, zElems * sizeof(T))); + CUDA_CHECK(cudaMalloc(&Z, zElems * sizeof(T))); + + r.uniform(X, xElems, T(-10.0), T(10.0), stream); + r.uniform(Y, yElems, T(-10.0), T(10.0), stream); + + dim3 blocks(raft::ceildiv(params.M, 128), + raft::ceildiv(params.N, 4), 1); + dim3 threads(128, 4, 1); + + naiveGemm<<>>(refZ, X, Y, params.M, params.N, params.K, + params.zLayout, params.xLayout, + params.yLayout); + + gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout, + params.xLayout, params.yLayout, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(refZ)); + CUDA_CHECK(cudaFree(Z)); + } + + protected: + GemmLayoutInputs params; + T *refZ = NULL; // Reference result for comparison + T *Z = NULL; // Computed result +}; + +const std::vector> inputsf = { + {80, 70, 80, true, true, true, 76433ULL}, + {80, 100, 40, true, true, false, 426646ULL}, + {20, 100, 20, true, false, true, 237703ULL}, + {100, 60, 30, true, false, false, 538004ULL}, + {50, 10, 60, false, true, true, 73012ULL}, + {90, 90, 30, false, true, false, 538147ULL}, + {30, 100, 10, false, false, true, 412352ULL}, + {40, 80, 100, false, false, false, 297941ULL}}; + +const std::vector> inputsd = { + {10, 70, 40, true, true, true, 535648ULL}, + {30, 30, 30, true, true, false, 956681ULL}, + {70, 80, 50, true, false, true, 875083ULL}, + {80, 90, 70, true, false, false, 50744ULL}, + {90, 90, 30, false, true, true, 506321ULL}, + {40, 100, 70, false, true, false, 638418ULL}, + {80, 50, 30, false, false, true, 701529ULL}, + {50, 80, 60, false, false, false, 893038ULL}}; + +typedef GemmLayoutTest GemmLayoutTestF; +TEST_P(GemmLayoutTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, + raft::CompareApprox(1e-4))); +} + +typedef GemmLayoutTest GemmLayoutTestD; +TEST_P(GemmLayoutTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, + raft::CompareApprox(1e-6))); +} + +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu new file mode 100644 index 0000000000..adbb339de2 --- /dev/null +++ b/cpp/test/linalg/map_then_reduce.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveMapReduceKernel(Type *out, const Type *in, size_t len, + MapOp map) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + raft::myAtomicAdd(out, map(in[idx])); + } +} + +template +void naiveMapReduce(Type *out, const Type *in, size_t len, MapOp map, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, (size_t)TPB); + naiveMapReduceKernel + <<>>(out, in, len, map); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct MapReduceInputs { + T tolerance; + size_t len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void mapReduceLaunch(T *out_ref, T *out, const T *in, size_t len, + cudaStream_t stream) { + auto op = [] __device__(T in) { return in; }; + naiveMapReduce(out_ref, in, len, op, stream); + mapThenSumReduce(out, len, op, 0, in); +} + +template +class MapReduceTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + auto len = params.len; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + mapReduceLaunch(out_ref, out, in, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + MapReduceInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf = { + {0.001f, 1024 * 1024, 1234ULL}}; +typedef MapReduceTest MapReduceTestF; +TEST_P(MapReduceTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.000001, 1024 * 1024, 1234ULL}}; +typedef MapReduceTest MapReduceTestD; +TEST_P(MapReduceTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu new file mode 100644 index 0000000000..aa46c78b0f --- /dev/null +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../test_utils.h" +#include "matrix_vector_op.cuh" + +namespace raft { +namespace linalg { + +template +struct MatVecOpInputs { + T tolerance; + IdxType rows, cols; + bool rowMajor, bcastAlongRows, useTwoVectors; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const MatVecOpInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, bool useTwoVectors, + cudaStream_t stream) { + if (useTwoVectors) { + matrixVectorOp( + out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows, + [] __device__(T a, T b, T c) { return a + b + c; }, stream); + } else { + matrixVectorOp( + out, in, vec1, D, N, rowMajor, bcastAlongRows, + [] __device__(T a, T b) { return a + b; }, stream); + } +} + +template +class MatVecOpTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + IdxType N = params.rows, D = params.cols; + IdxType len = N * D; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + IdxType vecLen = params.bcastAlongRows ? 
D : N; + allocate(vec1, vecLen); + allocate(vec2, vecLen); + r.uniform(in, len, (T)-1.0, (T)1.0, stream); + r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); + r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); + if (params.useTwoVectors) { + naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, + params.bcastAlongRows, (T)1.0); + } else { + naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, + params.bcastAlongRows, (T)1.0); + } + matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, + params.bcastAlongRows, params.useTwoVectors, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(vec1)); + CUDA_CHECK(cudaFree(vec2)); + CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(in)); + } + + protected: + MatVecOpInputs params; + T *in, *out, *out_ref, *vec1, *vec2; +}; + +const std::vector> inputsf_i32 = { + {0.00001f, 1024, 32, true, true, false, 1234ULL}, + {0.00001f, 1024, 64, true, true, false, 1234ULL}, + {0.00001f, 1024, 32, true, false, false, 1234ULL}, + {0.00001f, 1024, 64, true, false, false, 1234ULL}, + {0.00001f, 1024, 32, false, true, false, 1234ULL}, + {0.00001f, 1024, 64, false, true, false, 1234ULL}, + {0.00001f, 1024, 32, false, false, false, 1234ULL}, + {0.00001f, 1024, 64, false, false, false, 1234ULL}, + + {0.00001f, 1024, 32, true, true, true, 1234ULL}, + {0.00001f, 1024, 64, true, true, true, 1234ULL}, + {0.00001f, 1024, 32, true, false, true, 1234ULL}, + {0.00001f, 1024, 64, true, false, true, 1234ULL}, + {0.00001f, 1024, 32, false, true, true, 1234ULL}, + {0.00001f, 1024, 64, false, true, true, 1234ULL}, + {0.00001f, 1024, 32, false, false, true, 1234ULL}, + {0.00001f, 1024, 64, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestF_i32; +TEST_P(MatVecOpTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.00001f, 2500, 250, false, false, false, 1234ULL}, + {0.00001f, 2500, 250, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestF_i64; +TEST_P(MatVecOpTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsd_i32 = { + {0.0000001, 1024, 32, true, true, false, 1234ULL}, + {0.0000001, 1024, 64, true, true, false, 1234ULL}, + {0.0000001, 1024, 32, true, false, false, 1234ULL}, + {0.0000001, 1024, 64, true, false, false, 1234ULL}, + {0.0000001, 1024, 32, false, true, false, 1234ULL}, + {0.0000001, 1024, 64, false, true, false, 1234ULL}, + {0.0000001, 1024, 32, false, false, false, 1234ULL}, + {0.0000001, 1024, 64, false, false, false, 1234ULL}, + + {0.0000001, 1024, 32, true, true, true, 1234ULL}, + {0.0000001, 1024, 64, true, true, true, 1234ULL}, + {0.0000001, 1024, 32, true, false, true, 1234ULL}, + {0.0000001, 1024, 64, true, false, true, 1234ULL}, + {0.0000001, 1024, 32, false, true, true, 1234ULL}, + {0.0000001, 1024, 64, false, true, true, 1234ULL}, + {0.0000001, 1024, 32, false, false, true, 1234ULL}, + {0.0000001, 1024, 64, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestD_i32; +TEST_P(MatVecOpTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} 
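+// NOTE: the four (rowMajor, bcastAlongRows) combinations exercised above all
+// reduce to a single indexing rule for the broadcast vector, mirrored by
+// naiveMatVec in matrix_vector_op.cuh. A minimal host-side sketch of that
+// rule; `broadcastIndex` is a hypothetical name used for illustration only,
+// not a raft API:
+template <typename IdxType>
+inline IdxType broadcastIndex(IdxType idx, IdxType D, IdxType N, bool rowMajor,
+                              bool bcastAlongRows) {
+  if (rowMajor && bcastAlongRows) return idx % D;    // vector has D entries
+  if (!rowMajor && !bcastAlongRows) return idx % N;  // vector has N entries
+  if (rowMajor && !bcastAlongRows) return idx / D;   // vector has N entries
+  return idx / N;                                    // vector has D entries
+}
+// This rule is also why SetUp sizes the vectors as
+// vecLen = bcastAlongRows ? D : N.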
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.0000001, 2500, 250, false, false, false, 1234ULL}, + {0.0000001, 2500, 250, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestD_i64; +TEST_P(MatVecOpTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh new file mode 100644 index 0000000000..69c45c9866 --- /dev/null +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Type scalar) { + IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; + IdxType len = N * D; + IdxType col; + if (rowMajor && bcastAlongRows) { + col = idx % D; + } else if (!rowMajor && !bcastAlongRows) { + col = idx % N; + } else if (rowMajor && !bcastAlongRows) { + col = idx / D; + } else { + col = idx / N; + } + if (idx < len) { + out[idx] = mat[idx] + scalar * vec[col]; + } +} + +template +void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) { + static const IdxType TPB = 64; + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel + <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, + Type scalar) { + IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; + IdxType len = N * D; + IdxType col; + if (rowMajor && bcastAlongRows) { + col = idx % D; + } else if (!rowMajor && !bcastAlongRows) { + col = idx % N; + } else if (rowMajor && !bcastAlongRows) { + col = idx / D; + } else { + col = idx / N; + } + if (idx < len) { + out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; + } +} + +template +void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2, + IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, + Type scalar) { + static const IdxType TPB = 64; + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel<<>>(out, mat, vec1, vec2, D, N, rowMajor, + bcastAlongRows, scalar); + CUDA_CHECK(cudaPeekAtLastError()); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu new file mode 100644 index 
0000000000..1d3e753de3 --- /dev/null +++ b/cpp/test/linalg/multiply.cu @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +template +class MultiplyTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + raft::allocate(in, len); + raft::allocate(out_ref, len); + raft::allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + naiveScale(out_ref, in, params.scalar, len, stream); + multiplyScalar(out, in, params.scalar, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + UnaryOpInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef MultiplyTest MultiplyTestF; +TEST_P(MultiplyTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, + ::testing::ValuesIn(inputsf)); + +typedef MultiplyTest MultiplyTestD; +const std::vector> inputsd = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(MultiplyTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu new file mode 100644 index 0000000000..acc25addd0 --- /dev/null +++ b/cpp/test/linalg/norm.cu @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/linalg/norm.cuh>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace linalg {
+
+template <typename T>
+struct NormInputs {
+  T tolerance;
+  int rows, cols;
+  NormType type;
+  bool do_sqrt;
+  bool rowMajor;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const NormInputs<T> &I) {
+  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", "
+     << I.type << ", " << I.do_sqrt << ", " << I.rowMajor << ", " << I.seed
+     << '}' << std::endl;
+  return os;
+}
+
+///// Row-wise norm test definitions
+template <typename Type>
+__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N,
+                                   NormType type, bool do_sqrt) {
+  Type acc = (Type)0;
+  int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
+  if (rowStart < N) {
+    for (int i = 0; i < D; ++i) {
+      if (type == L2Norm) {
+        acc += data[rowStart * D + i] * data[rowStart * D + i];
+      } else {
+        acc += raft::myAbs(data[rowStart * D + i]);
+      }
+    }
+    dots[rowStart] = do_sqrt ? raft::mySqrt(acc) : acc;
+  }
+}
+
+template <typename Type>
+void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type,
+                  bool do_sqrt, cudaStream_t stream) {
+  static const int TPB = 64;
+  int nblks = raft::ceildiv(N, TPB);
+  naiveRowNormKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+template <typename T>
+class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
+ public:
+  void SetUp() override {
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+    // all work runs on the member stream; it is destroyed once, in TearDown
+    raft::allocate(data, len);
+    raft::allocate(dots_exp, rows);
+    raft::allocate(dots_act, rows);
+    r.uniform(data, len, T(-1.0), T(1.0), stream);
+    naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
+                 stream);
+    if (params.do_sqrt) {
+      auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
+      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
+              fin_op);
+    } else {
+      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor,
+              stream);
+    }
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(dots_exp));
+    CUDA_CHECK(cudaFree(dots_act));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  NormInputs<T> params;
+  T *data, *dots_exp, *dots_act;
+  cudaStream_t stream;
+};
+
+///// Column-wise norm test definitions
+template <typename Type>
+__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N,
+                                   NormType type, bool do_sqrt) {
+  int colID = threadIdx.x + blockIdx.x * blockDim.x;
+  if (colID >= D) return;  // avoid out-of-bounds thread
+
+  Type acc = 0;
+  for (int i = 0; i < N; i++) {
+    Type v = data[colID + i * D];
+    acc += type == L2Norm ? v * v : raft::myAbs(v);
+  }
+
+  dots[colID] = do_sqrt ? raft::mySqrt(acc) : acc;
+}
+
+template <typename Type>
+void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type,
+                  bool do_sqrt, cudaStream_t stream) {
+  static const int TPB = 64;
+  int nblks = raft::ceildiv(D, TPB);
+  naiveColNormKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+template <typename T>
+class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
+ public:
+  void SetUp() override {
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+    // as in RowNormTest, all work runs on the member stream owned by TearDown
+    raft::allocate(data, len);
+    r.uniform(data, len, T(-1.0), T(1.0), stream);
+    raft::allocate(dots_exp, cols);
+    raft::allocate(dots_act, cols);
+
+    naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
+                 stream);
+    if (params.do_sqrt) {
+      auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
+      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
+              fin_op);
+    } else {
+      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor,
+              stream);
+    }
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(dots_exp));
+    CUDA_CHECK(cudaFree(dots_act));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  NormInputs<T> params;
+  T *data, *dots_exp, *dots_act;
+  cudaStream_t stream;
+};
+
+///// Row- and column-wise tests
+const std::vector<NormInputs<float>> inputsf = {
+  {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
+
+  {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
+
+const std::vector<NormInputs<double>> inputsd = {
+  {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 64, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 128, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 256, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 32, L2Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 64, L2Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 128, L2Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 256, L2Norm, false, true, 1234ULL},
+
+  {0.00000001, 1024, 32, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 64, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 128, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 256, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 32, L2Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 64, L2Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 128, L2Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}};
+
+typedef RowNormTest<float> RowNormTestF;
+TEST_P(RowNormTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
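+// NOTE: a quick worked example of what naiveRowNorm computes for one row
+// r = [0.5, -0.5, 1.0]:
+//   L1Norm             -> |0.5| + |-0.5| + |1.0| = 2.0
+//   L2Norm             -> 0.25 + 0.25 + 1.0     = 1.5
+//   L2Norm w/ do_sqrt  -> sqrt(1.5)             ~= 1.2247
+// A minimal host-side reference with the same contract; `hostRowNorm` is a
+// hypothetical helper shown only to document the expected values, and it
+// assumes <cmath> is available for std::sqrt:
+template <typename T>
+inline T hostRowNorm(const T *row, int D, NormType type, bool do_sqrt) {
+  T acc = T(0);
+  for (int i = 0; i < D; ++i) {
+    T v = row[i] < T(0) ? -row[i] : row[i];  // |row[i]|
+    acc += type == L2Norm ? row[i] * row[i] : v;
+  }
+  return do_sqrt ? std::sqrt(acc) : acc;
+}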
+typedef RowNormTest RowNormTestD; +TEST_P(RowNormTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, + ::testing::ValuesIn(inputsd)); + +const std::vector> inputscf = { + {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 64, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 128, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 256, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 32, 1024, L2Norm, false, true, 1234ULL}, + {0.00001f, 64, 1024, L2Norm, false, true, 1234ULL}, + {0.00001f, 128, 1024, L2Norm, false, true, 1234ULL}, + {0.00001f, 256, 1024, L2Norm, false, true, 1234ULL}, + + {0.00001f, 32, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 64, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 128, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 256, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 32, 1024, L2Norm, true, true, 1234ULL}, + {0.00001f, 64, 1024, L2Norm, true, true, 1234ULL}, + {0.00001f, 128, 1024, L2Norm, true, true, 1234ULL}, + {0.00001f, 256, 1024, L2Norm, true, true, 1234ULL}}; + +const std::vector> inputscd = { + {0.00000001, 32, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 64, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 128, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 256, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 32, 1024, L2Norm, false, true, 1234ULL}, + {0.00000001, 64, 1024, L2Norm, false, true, 1234ULL}, + {0.00000001, 128, 1024, L2Norm, false, true, 1234ULL}, + {0.00000001, 256, 1024, L2Norm, false, true, 1234ULL}, + + {0.00000001, 32, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 64, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 128, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 256, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 32, 1024, L2Norm, true, true, 1234ULL}, + {0.00000001, 64, 1024, L2Norm, true, true, 1234ULL}, + {0.00000001, 128, 1024, L2Norm, true, true, 1234ULL}, + {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; + +typedef ColNormTest ColNormTestF; +TEST_P(ColNormTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef ColNormTest ColNormTestD; +TEST_P(ColNormTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, + ::testing::ValuesIn(inputscf)); + +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, + ::testing::ValuesIn(inputscd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu new file mode 100644 index 0000000000..255cf1a696 --- /dev/null +++ b/cpp/test/linalg/reduce.cu @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +template +struct ReduceInputs { + T tolerance; + int rows, cols; + bool rowMajor, alongRows; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const ReduceInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void reduceLaunch(T *dots, const T *data, int cols, int rows, bool rowMajor, + bool alongRows, bool inplace, cudaStream_t stream) { + reduce(dots, data, cols, rows, (T)0, rowMajor, alongRows, stream, inplace, + [] __device__(T in, int i) { return in * in; }); +} + +template +class ReduceTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + int len = rows * cols; + outlen = params.alongRows ? rows : cols; + raft::allocate(data, len); + raft::allocate(dots_exp, outlen); + raft::allocate(dots_act, outlen); + r.uniform(data, len, T(-1.0), T(1.0), stream); + naiveReduction(dots_exp, data, cols, rows, params.rowMajor, + params.alongRows, stream); + + // Perform reduction with default inplace = false first + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, + false, stream); + // Add to result with inplace = true next, which shouldn't affect + // in the case of coalescedReduction! + if (!(params.rowMajor ^ params.alongRows)) { + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, + params.alongRows, true, stream); + } + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(dots_exp)); + CUDA_CHECK(cudaFree(dots_act)); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + protected: + ReduceInputs params; + T *data, *dots_exp, *dots_act; + int outlen; + cudaStream_t stream; +}; + +const std::vector> inputsf = { + {0.000002f, 1024, 32, true, true, 1234ULL}, + {0.000002f, 1024, 64, true, true, 1234ULL}, + {0.000002f, 1024, 128, true, true, 1234ULL}, + {0.000002f, 1024, 256, true, true, 1234ULL}, + {0.000002f, 1024, 32, true, false, 1234ULL}, + {0.000002f, 1024, 64, true, false, 1234ULL}, + {0.000002f, 1024, 128, true, false, 1234ULL}, + {0.000002f, 1024, 256, true, false, 1234ULL}, + {0.000002f, 1024, 32, false, true, 1234ULL}, + {0.000002f, 1024, 64, false, true, 1234ULL}, + {0.000002f, 1024, 128, false, true, 1234ULL}, + {0.000002f, 1024, 256, false, true, 1234ULL}, + {0.000002f, 1024, 32, false, false, 1234ULL}, + {0.000002f, 1024, 64, false, false, 1234ULL}, + {0.000002f, 1024, 128, false, false, 1234ULL}, + {0.000002f, 1024, 256, false, false, 1234ULL}}; + +const std::vector> inputsd = { + {0.000000001, 1024, 32, true, true, 1234ULL}, + {0.000000001, 1024, 64, true, true, 1234ULL}, + {0.000000001, 1024, 128, true, true, 1234ULL}, + {0.000000001, 1024, 256, true, true, 1234ULL}, + {0.000000001, 1024, 32, true, false, 1234ULL}, + {0.000000001, 1024, 64, true, false, 1234ULL}, + {0.000000001, 1024, 128, true, false, 1234ULL}, + {0.000000001, 1024, 256, true, false, 1234ULL}, + {0.000000001, 1024, 32, false, true, 1234ULL}, + {0.000000001, 1024, 64, false, true, 1234ULL}, + {0.000000001, 1024, 128, 
false, true, 1234ULL}, + {0.000000001, 1024, 256, false, true, 1234ULL}, + {0.000000001, 1024, 32, false, false, 1234ULL}, + {0.000000001, 1024, 64, false, false, 1234ULL}, + {0.000000001, 1024, 128, false, false, 1234ULL}, + {0.000000001, 1024, 256, false, false, 1234ULL}}; + +typedef ReduceTest ReduceTestF; +TEST_P(ReduceTestF, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); +} + +typedef ReduceTest ReduceTestD; +TEST_P(ReduceTestD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh new file mode 100644 index 0000000000..18261287cf --- /dev/null +++ b/cpp/test/linalg/reduce.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void naiveCoalescedReductionKernel(Type *dots, const Type *data, + int D, int N) { + Type acc = (Type)0; + int rowStart = threadIdx.x + blockIdx.x * blockDim.x; + if (rowStart < N) { + for (int i = 0; i < D; ++i) { + acc += data[rowStart * D + i] * data[rowStart * D + i]; + } + dots[rowStart] = 2 * acc; + } +} + +template +void naiveCoalescedReduction(Type *dots, const Type *data, int D, int N, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel + <<>>(dots, data, D, N); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +void unaryAndGemv(Type *dots, const Type *data, int D, int N, + cudaStream_t stream) { + //computes a MLCommon unary op on data (squares it), then computes Ax + //(A input matrix and x column vector) to sum columns + thrust::device_vector sq(D * N); + raft::linalg::unaryOp( + thrust::raw_pointer_cast(sq.data()), data, D * N, + [] __device__(Type v) { return v * v; }, stream); + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); + thrust::device_vector ones(N, 1); //column vector [1...1] + Type alpha = 1, beta = 0; + CUBLAS_CHECK(raft::linalg::cublasgemv( + handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, + thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUDA_CHECK(cudaDeviceSynchronize()); + CUBLAS_CHECK(cublasDestroy(handle)); +} + +template +void naiveReduction(Type *dots, const Type *data, int D, int N, bool rowMajor, + bool alongRows, cudaStream_t stream) { + if (rowMajor && alongRows) { + naiveCoalescedReduction(dots, data, D, N, stream); + } else if (rowMajor && !alongRows) { + unaryAndGemv(dots, data, D, N, stream); + } else if (!rowMajor && alongRows) { + unaryAndGemv(dots, data, N, D, stream); + } else { + 
naiveCoalescedReduction(dots, data, N, D, stream); + } + CUDA_CHECK(cudaDeviceSynchronize()); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu new file mode 100644 index 0000000000..b27fa2ac1a --- /dev/null +++ b/cpp/test/linalg/strided_reduction.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +template +struct stridedReductionInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; +}; + +template +void stridedReductionLaunch(T *dots, const T *data, int cols, int rows, + cudaStream_t stream) { + stridedReduction(dots, data, cols, rows, (T)0, stream, false, + [] __device__(T in, int i) { return in * in; }); +} + +template +class stridedReductionTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + int len = rows * cols; + + raft::allocate(data, len); + raft::allocate(dots_exp, cols); //expected dot products (from test) + raft::allocate(dots_act, cols); //actual dot products (from prim) + r.uniform(data, len, T(-1.0), T(1.0), + stream); //initialize matrix to random + + unaryAndGemv(dots_exp, data, cols, rows, stream); + stridedReductionLaunch(dots_act, data, cols, rows, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(dots_exp)); + CUDA_CHECK(cudaFree(dots_act)); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + protected: + stridedReductionInputs params; + T *data, *dots_exp, *dots_act; + cudaStream_t stream; +}; + +const std::vector> inputsf = { + {0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; + +const std::vector> inputsd = { + {0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; + +typedef stridedReductionTest stridedReductionTestF; +TEST_P(stridedReductionTestF, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef stridedReductionTest stridedReductionTestD; +TEST_P(stridedReductionTestD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu new file mode 100644 index 
0000000000..ced3f65fdd --- /dev/null +++ b/cpp/test/linalg/subtract.cu @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveSubtractElemKernel(Type *out, const Type *in1, + const Type *in2, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in1[idx] - in2[idx]; + } +} + +template +void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveSubtractElemKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1, + const Type in2, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in1[idx] - in2; + } +} + +template +void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveSubtractScalarKernel + <<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct SubtractInputs { + T tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const SubtractInputs &dims) { + return os; +} + +template +class SubtractTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + raft::allocate(in1, len); + raft::allocate(in2, len); + raft::allocate(out_ref, len); + raft::allocate(out, len); + r.uniform(in1, len, T(-1.0), T(1.0), stream); + r.uniform(in2, len, T(-1.0), T(1.0), stream); + + naiveSubtractElem(out_ref, in1, in2, len, stream); + naiveSubtractScalar(out_ref, out_ref, T(1), len, stream); + + subtract(out, in1, in2, len, stream); + subtractScalar(out, out, T(1), len, stream); + subtract(in1, in1, in2, len, stream); + subtractScalar(in1, in1, T(1), len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in1)); + CUDA_CHECK(cudaFree(in2)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + SubtractInputs params; + T *in1, *in2, *out_ref, *out; +}; + +const std::vector> inputsf2 = { + {0.000001f, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd2 = { + {0.00000001, 1024 * 1024, 1234ULL}}; + +typedef SubtractTest SubtractTestF; +TEST_P(SubtractTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, + raft::CompareApprox(params.tolerance))); +} + +typedef SubtractTest SubtractTestD; 
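+// NOTE: each TEST_P below checks two things -- that the out-of-place result
+// `out` matches the naive reference `out_ref`, and that the in-place calls
+// above, which overwrote `in1`, produced the same values. In-place aliasing
+// (output buffer == input buffer) is safe for these element-wise primitives
+// because every thread reads and writes only its own index, e.g.
+//   out[idx] = in1[idx] - in2[idx];
+// has no cross-thread dependence, so `in1` may alias `out` without a race.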
+TEST_P(SubtractTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, + ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu new file mode 100644 index 0000000000..fff321768f --- /dev/null +++ b/cpp/test/linalg/svd.cu @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct SvdInputs { + T tolerance; + int len; + int n_row; + int n_col; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const SvdInputs &dims) { + return os; +} + +template +class SvdTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + raft::handle_t handle; + + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream = handle.get_stream(); + raft::allocate(data, len); + + ASSERT(params.n_row == 3, "This test only supports nrows=3!"); + ASSERT(params.len == 6, "This test only supports len=6!"); + T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; + raft::update_device(data, data_h, len, stream); + + int left_evl = params.n_row * params.n_col; + int right_evl = params.n_col * params.n_col; + + raft::allocate(left_eig_vectors_qr, left_evl); + raft::allocate(right_eig_vectors_trans_qr, right_evl); + raft::allocate(sing_vals_qr, params.n_col); + + // allocate(left_eig_vectors_jacobi, left_evl); + // allocate(right_eig_vectors_trans_jacobi, right_evl); + // allocate(sing_vals_jacobi, params.n_col); + + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, + 0.488195, 0.110706, -0.865685}; + + T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636}; + + T sing_vals_ref_h[] = {7.065283, 1.040081}; + + raft::allocate(left_eig_vectors_ref, left_evl); + raft::allocate(right_eig_vectors_ref, right_evl); + raft::allocate(sing_vals_ref, params.n_col); + + raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, + stream); + raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, + right_evl, stream); + raft::update_device(sing_vals_ref, sing_vals_ref_h, params.n_col, stream); + + svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, + left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, + stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(left_eig_vectors_qr)); + CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); + CUDA_CHECK(cudaFree(sing_vals_qr)); 
+ CUDA_CHECK(cudaFree(left_eig_vectors_ref)); + CUDA_CHECK(cudaFree(right_eig_vectors_ref)); + CUDA_CHECK(cudaFree(sing_vals_ref)); + } + + protected: + SvdInputs params; + T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, + *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; +}; + +const std::vector> inputsf2 = { + {0.00001f, 3 * 2, 3, 2, 1234ULL}}; + +const std::vector> inputsd2 = { + {0.00001, 3 * 2, 3, 2, 1234ULL}}; + +typedef SvdTest SvdTestValF; +TEST_P(SvdTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestValD; +TEST_P(SvdTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestLeftVecF; +TEST_P(SvdTestLeftVecF, Result) { + ASSERT_TRUE(raft::devArrMatch( + left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestLeftVecD; +TEST_P(SvdTestLeftVecD, Result) { + ASSERT_TRUE(raft::devArrMatch( + left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestRightVecF; +TEST_P(SvdTestRightVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestRightVecD; +TEST_P(SvdTestRightVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, + ::testing::ValuesIn(inputsd2)); + +// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, +// ::testing::ValuesIn(inputsf2)); + +// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecD, +//::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu new file mode 100644 index 0000000000..f10b029962 --- /dev/null +++ b/cpp/test/linalg/transpose.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/handle.hpp>
+#include <raft/linalg/transpose.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace linalg {
+
+template <typename T>
+struct TransposeInputs {
+  T tolerance;
+  int len;
+  int n_row;
+  int n_col;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os,
+                           const TransposeInputs<T> &dims) {
+  return os;
+}
+
+template <typename T>
+class TransposeTest : public ::testing::TestWithParam<TransposeInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<TransposeInputs<T>>::GetParam();
+
+    stream = handle.get_stream();
+
+    int len = params.len;
+
+    raft::allocate(data, len);
+    ASSERT(params.len == 9, "This test works only with len=9!");
+    T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
+    raft::update_device(data, data_h, len, stream);
+
+    raft::allocate(data_trans_ref, len);
+    T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0};
+    raft::update_device(data_trans_ref, data_ref_h, len, stream);
+
+    raft::allocate(data_trans, len);
+
+    transpose(handle, data, data_trans, params.n_row, params.n_col, stream);
+    transpose(data, params.n_row, stream);
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(data_trans));
+    CUDA_CHECK(cudaFree(data_trans_ref));
+  }
+
+ protected:
+  TransposeInputs<T> params;
+  T *data, *data_trans, *data_trans_ref;
+  raft::handle_t handle;
+  cudaStream_t stream;
+};
+
+const std::vector<TransposeInputs<float>> inputsf2 = {
+  {0.1f, 3 * 3, 3, 3, 1234ULL}};
+
+const std::vector<TransposeInputs<double>> inputsd2 = {
+  {0.1, 3 * 3, 3, 3, 1234ULL}};
+
+typedef TransposeTest<float> TransposeTestValF;
+TEST_P(TransposeTestValF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data_trans, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
+
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
+}
+
+typedef TransposeTest<double> TransposeTestValD;
+TEST_P(TransposeTestValD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data_trans, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
+
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF,
+                         ::testing::ValuesIn(inputsf2));
+
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD,
+                         ::testing::ValuesIn(inputsd2));
+
+} // end namespace linalg
+} // end namespace raft
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
new file mode 100644
index 0000000000..666ab8619d
--- /dev/null
+++ b/cpp/test/linalg/unary_op.cu
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, + cudaStream_t stream) { + if (in == nullptr) { + auto op = [scalar] __device__(OutType * ptr, IdxType idx) { + *ptr = static_cast(scalar * idx); + }; + writeOnlyUnaryOp(out, len, op, stream); + } else { + auto op = [scalar] __device__(InType in) { + return static_cast(in * scalar); + }; + unaryOp(out, in, len, op, stream); + } +} + +template +class UnaryOpTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam< + UnaryOpInputs>::GetParam(); + raft::random::Rng r(params.seed); + CUDA_CHECK(cudaStreamCreate(&stream)); + auto len = params.len; + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in, len, InType(-1.0), InType(1.0), stream); + } + + void TearDown() override { + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + virtual void DoTest() { + auto len = params.len; + auto scalar = params.scalar; + naiveScale(out_ref, in, scalar, len, stream); + unaryOpLaunch(out, in, scalar, len, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); + } + + UnaryOpInputs params; + InType *in; + OutType *out_ref, *out; + cudaStream_t stream; +}; + +template +class WriteOnlyUnaryOpTest : public UnaryOpTest { + protected: + void DoTest() override { + auto len = this->params.len; + auto scalar = this->params.scalar; + naiveScale(this->out_ref, (OutType *)nullptr, scalar, len, this->stream); + unaryOpLaunch(this->out, (OutType *)nullptr, scalar, len, this->stream); + CUDA_CHECK(cudaStreamSynchronize(this->stream)); + ASSERT_TRUE(devArrMatch(this->out_ref, this->out, this->params.len, + CompareApprox(this->params.tolerance))); + } +}; + +#define UNARY_OP_TEST(Name, inputs) \ + TEST_P(Name, Result) { DoTest(); } \ + INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs)) + +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i32; +UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i32; +UNARY_OP_TEST(WriteOnlyUnaryOpTestF_i32, inputsf_i32); + +const std::vector> inputsf_i64 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i64; +UNARY_OP_TEST(UnaryOpTestF_i64, inputsf_i64); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i64; +UNARY_OP_TEST(WriteOnlyUnaryOpTestF_i64, inputsf_i64); + +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i32_D; +UNARY_OP_TEST(UnaryOpTestF_i32_D, inputsf_i32_d); + +const std::vector> inputsd_i32 = { + {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestD_i32; +UNARY_OP_TEST(UnaryOpTestD_i32, inputsd_i32); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestD_i32; +UNARY_OP_TEST(WriteOnlyUnaryOpTestD_i32, inputsd_i32); + +const std::vector> inputsd_i64 = { + {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestD_i64; 
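+// NOTE: the write-only variants exercised here differ from plain unaryOp in
+// the lambda signature. Instead of mapping an input value, the op receives
+// the output pointer plus the element index and never reads the input, e.g.
+// (mirroring the lambda in unaryOpLaunch above):
+//   [scalar] __device__(OutType *ptr, IdxType idx) {
+//     *ptr = static_cast<OutType>(scalar * idx);
+//   }
+// which lets writeOnlyUnaryOp skip the input load entirely -- hence the
+// in == nullptr convention shared by unaryOpLaunch and naiveScale.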
+UNARY_OP_TEST(UnaryOpTestD_i64, inputsd_i64); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestD_i64; +UNARY_OP_TEST(WriteOnlyUnaryOpTestD_i64, inputsd_i64); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh new file mode 100644 index 0000000000..be3f1124c5 --- /dev/null +++ b/cpp/test/linalg/unary_op.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, + IdxType len) { + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); + if (idx < len) { + if (in == nullptr) { + // used for testing writeOnlyUnaryOp + out[idx] = static_cast(scalar * idx); + } else { + out[idx] = static_cast(scalar * in[idx]); + } + } +} + +template +void naiveScale(OutType *out, const InType *in, InType scalar, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel + <<>>(out, in, scalar, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct UnaryOpInputs { + OutType tolerance; + IdxType len; + InType scalar; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const UnaryOpInputs &d) { + return os; +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu new file mode 100644 index 0000000000..578139623a --- /dev/null +++ b/cpp/test/matrix/math.cu @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace matrix { + +template +__global__ void nativePowerKernel(Type *in, Type *out, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in[idx] * in[idx]; + } +} + +template +void naivePower(Type *in, Type *out, int len, cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + nativePowerKernel<<>>(in, out, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void nativeSqrtKernel(Type *in, Type *out, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = sqrt(in[idx]); + } +} + +template +void naiveSqrt(Type *in, Type *out, int len) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + nativeSqrtKernel<<>>(in, out, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, + int colCount) { + int d_i = blockIdx.x * rowCount; + int end = d_i + rowCount; + + if (blockIdx.x < colCount) { + Type max = 0.0; + int max_index = 0; + for (int i = d_i; i < end; i++) { + Type val = in[i]; + if (val < 0.0) { + val = -val; + } + if (val > max) { + max = val; + max_index = i; + } + } + + for (int i = d_i; i < end; i++) { + if (in[max_index] < 0.0) { + out[i] = -in[i]; + } else { + out[i] = in[i]; + } + } + } + + __syncthreads(); +} + +template +void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) { + naiveSignFlipKernel<<>>(in, out, rowCount, colCount); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct MathInputs { + T tolerance; + int n_row; + int n_col; + int len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const MathInputs &dims) { + return os; +} + +template +class MathTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + random::Rng r(params.seed); + int len = params.len; + + allocate(in_power, len); + allocate(out_power_ref, len); + allocate(in_sqrt, len); + allocate(out_sqrt_ref, len); + allocate(in_sign_flip, len); + allocate(out_sign_flip_ref, len); + + raft::handle_t handle; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + allocate(in_ratio, 4); + T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0}; + update_device(in_ratio, in_ratio_h, 4, stream); + + allocate(out_ratio_ref, 4); + T out_ratio_ref_h[4] = {0.125, 0.25, 0.25, 0.375}; + update_device(out_ratio_ref, out_ratio_ref_h, 4, stream); + + r.uniform(in_power, len, T(-1.0), T(1.0), stream); + r.uniform(in_sqrt, len, T(0.0), T(1.0), stream); + // r.uniform(in_ratio, len, T(0.0), T(1.0)); + r.uniform(in_sign_flip, len, T(-100.0), T(100.0), stream); + + naivePower(in_power, out_power_ref, len, stream); + power(in_power, len, stream); + + naiveSqrt(in_sqrt, out_sqrt_ref, len); + seqRoot(in_sqrt, len, stream); + + ratio(handle, in_ratio, in_ratio, 4, stream); + + naiveSignFlip(in_sign_flip, out_sign_flip_ref, params.n_row, params.n_col); + signFlip(in_sign_flip, params.n_row, params.n_col, stream); + + allocate(in_recip, 4); + allocate(in_recip_ref, 4); + allocate(out_recip, 4); + // default threshold is 1e-15 + std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; + std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; + update_device(in_recip, in_recip_h.data(), 4, stream); + update_device(in_recip_ref, in_recip_ref_h.data(), 4, stream); + T recip_scalar = T(1.0); 
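+  // NOTE: with the default ~1e-15 threshold mentioned above, in_recip_ref
+  // encodes out[i] = scalar / in[i] when |in[i]| exceeds the threshold and
+  // 0 otherwise: 0.1 -> 10, 0.01 -> 100, -0.01 -> -100, 0.1e-16 -> 0
+  // (the flush to 0 applies only when setzero is requested, which is why the
+  // out-of-place result is compared on just its first 3 terms below).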
+ + // this `reciprocal()` has to go first bc next one modifies its input + reciprocal(in_recip, out_recip, recip_scalar, 4, stream); + + reciprocal(in_recip, recip_scalar, 4, stream, true); + + std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; + std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; + allocate(in_smallzero, 4); + allocate(out_smallzero, 4); + allocate(out_smallzero_ref, 4); + update_device(in_smallzero, in_small_val_zero_h.data(), 4, stream); + update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream); + setSmallValuesZero(out_smallzero, in_smallzero, 4, stream); + setSmallValuesZero(in_smallzero, 4, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in_power)); + CUDA_CHECK(cudaFree(out_power_ref)); + CUDA_CHECK(cudaFree(in_sqrt)); + CUDA_CHECK(cudaFree(out_sqrt_ref)); + CUDA_CHECK(cudaFree(in_ratio)); + CUDA_CHECK(cudaFree(out_ratio_ref)); + CUDA_CHECK(cudaFree(in_sign_flip)); + CUDA_CHECK(cudaFree(out_sign_flip_ref)); + CUDA_CHECK(cudaFree(in_recip)); + CUDA_CHECK(cudaFree(in_recip_ref)); + CUDA_CHECK(cudaFree(out_recip)); + CUDA_CHECK(cudaFree(in_smallzero)); + CUDA_CHECK(cudaFree(out_smallzero)); + CUDA_CHECK(cudaFree(out_smallzero_ref)); + } + + protected: + MathInputs params; + T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, + *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, + *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; +}; + +const std::vector> inputsf = { + {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd = { + {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; + +typedef MathTest MathPowerTestF; +TEST_P(MathPowerTestF, Result) { + ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathPowerTestD; +TEST_P(MathPowerTestD, Result) { + ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSqrtTestF; +TEST_P(MathSqrtTestF, Result) { + ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSqrtTestD; +TEST_P(MathSqrtTestD, Result) { + ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathRatioTestF; +TEST_P(MathRatioTestF, Result) { + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathRatioTestD; +TEST_P(MathRatioTestD, Result) { + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSignFlipTestF; +TEST_P(MathSignFlipTestF, Result) { + ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSignFlipTestD; +TEST_P(MathSignFlipTestD, Result) { + ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathReciprocalTestF; +TEST_P(MathReciprocalTestF, Result) { + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, + CompareApprox(params.tolerance))); + + // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. 
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
+                          CompareApprox<float>(params.tolerance)));
+}
+
+typedef MathTest<double> MathReciprocalTestD;
+TEST_P(MathReciprocalTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
+
+  // the 4th element exercises the setzero=true behavior, which the overload
+  // that produced out_recip does not have, so compare only the first three
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
+                          CompareApprox<double>(params.tolerance)));
+}
+
+typedef MathTest<float> MathSetSmallZeroTestF;
+TEST_P(MathSetSmallZeroTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
+}
+
+typedef MathTest<double> MathSetSmallZeroTestD;
+TEST_P(MathSetSmallZeroTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD,
+                         ::testing::ValuesIn(inputsd));
+
+}  // namespace matrix
+}  // namespace raft
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
new file mode 100644
index 0000000000..499d24ed41
--- /dev/null
+++ b/cpp/test/matrix/matrix.cu
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/matrix/matrix.cuh>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace matrix {
+
+template <typename T>
+struct MatrixInputs {
+  T tolerance;
+  int n_row;
+  int n_col;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const MatrixInputs<T> &dims) {
+  return os;
+}
+
+template <typename T>
+class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<MatrixInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int len = params.n_row * params.n_col;
+    cudaStream_t stream;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(in1, len);
+    raft::allocate(in2, len);
+    raft::allocate(in1_revr, len);
+    r.uniform(in1, len, T(-1.0), T(1.0), stream);
+
+    copy(in1, in2, params.n_row, params.n_col, stream);
+    // copy(in1, in1_revr, params.n_row, params.n_col);
+    // colReverse(in1_revr, params.n_row, params.n_col);
+
+    // outTrunc is a member (rather than a local) so TearDown can free it
+    raft::allocate(outTrunc, 6);
+    truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(in1));
+    CUDA_CHECK(cudaFree(in2));
+    CUDA_CHECK(cudaFree(in1_revr));
+    CUDA_CHECK(cudaFree(outTrunc));
+  }
+
+ protected:
+  MatrixInputs<T> params;
+  T *in1, *in2, *in1_revr, *outTrunc;
+};
+
+const std::vector<MatrixInputs<float>> inputsf2 = {{0.000001f, 4, 4, 1234ULL}};
+
+const std::vector<MatrixInputs<double>> inputsd2 = {
+  {0.00000001, 4, 4, 1234ULL}};
+
+typedef MatrixTest<float> MatrixTestF;
+TEST_P(MatrixTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef MatrixTest<double> MatrixTestD;
+TEST_P(MatrixTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF,
+                         ::testing::ValuesIn(inputsf2));
+
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD,
+                         ::testing::ValuesIn(inputsd2));
+
+}  // namespace matrix
+}  // namespace raft
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
new file mode 100644
index 0000000000..af10dcab30
--- /dev/null
+++ b/cpp/test/random/rng.cu
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <sys/timeb.h>
+#include <time.h>
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace random {
+
+enum RandomType {
+  RNG_Normal,
+  RNG_LogNormal,
+  RNG_Uniform,
+  RNG_Gumbel,
+  RNG_Logistic,
+  RNG_Exp,
+  RNG_Rayleigh,
+  RNG_Laplace
+};
+
+template <typename T, int TPB>
+__global__ void meanKernel(T* out, const T* data, int len) {
+  typedef cub::BlockReduce<T, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  T val = tid < len ?
data[tid] : T(0); + T x = BlockReduce(temp_storage).Sum(val); + __syncthreads(); + T xx = BlockReduce(temp_storage).Sum(val * val); + __syncthreads(); + if (threadIdx.x == 0) { + raft::myAtomicAdd(out, x); + raft::myAtomicAdd(out + 1, xx); + } +} + +template +struct RngInputs { + T tolerance; + int len; + // start, end: for uniform + // mean, sigma: for normal/lognormal + // mean, beta: for gumbel + // mean, scale: for logistic and laplace + // lambda: for exponential + // sigma: for rayleigh + T start, end; + RandomType type; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) { + return os; +} + +#include +#include + +template +class RngTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + // Tests are configured with their expected test-values sigma. For example, + // 4 x sigma indicates the test shouldn't fail 99.9% of the time. + num_sigma = 10; + params = ::testing::TestWithParam>::GetParam(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + Rng r(params.seed, params.gtype); + allocate(data, params.len); + allocate(stats, 2, true); + switch (params.type) { + case RNG_Normal: + r.normal(data, params.len, params.start, params.end, stream); + break; + case RNG_LogNormal: + r.lognormal(data, params.len, params.start, params.end, stream); + break; + case RNG_Uniform: + r.uniform(data, params.len, params.start, params.end, stream); + break; + case RNG_Gumbel: + r.gumbel(data, params.len, params.start, params.end, stream); + break; + case RNG_Logistic: + r.logistic(data, params.len, params.start, params.end, stream); + break; + case RNG_Exp: + r.exponential(data, params.len, params.start, stream); + break; + case RNG_Rayleigh: + r.rayleigh(data, params.len, params.start, stream); + break; + case RNG_Laplace: + r.laplace(data, params.len, params.start, params.end, stream); + break; + }; + static const int threads = 128; + meanKernel + <<>>(stats, data, + params.len); + update_host(h_stats, stats, 2, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + h_stats[0] /= params.len; + h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(stats)); + } + + void getExpectedMeanVar(T meanvar[2]) { + switch (params.type) { + case RNG_Normal: + meanvar[0] = params.start; + meanvar[1] = params.end * params.end; + break; + case RNG_LogNormal: { + auto var = params.end * params.end; + auto mu = params.start; + meanvar[0] = raft::myExp(mu + var * T(0.5)); + meanvar[1] = + (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + break; + } + case RNG_Uniform: + meanvar[0] = (params.start + params.end) * T(0.5); + meanvar[1] = params.end - params.start; + meanvar[1] = meanvar[1] * meanvar[1] / T(12.0); + break; + case RNG_Gumbel: { + auto gamma = T(0.577215664901532); + meanvar[0] = params.start + params.end * gamma; + meanvar[1] = T(3.1415) * T(3.1415) * params.end * params.end / T(6.0); + break; + } + case RNG_Logistic: + meanvar[0] = params.start; + meanvar[1] = T(3.1415) * T(3.1415) * params.end * params.end / T(3.0); + break; + case RNG_Exp: + meanvar[0] = T(1.0) / params.start; + meanvar[1] = meanvar[0] * meanvar[0]; + break; + case RNG_Rayleigh: + meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); + meanvar[1] = + ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; + break; + case RNG_Laplace: + 
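+        // Laplace(mu, b): mean = mu, variance = 2 * b^2, with
+        // mu = params.start and b = params.end as documented in RngInputs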
+        meanvar[0] = params.start;
+        meanvar[1] = T(2.0) * params.end * params.end;
+        break;
+    }
+  }
+
+ protected:
+  RngInputs<T> params;
+  T *data, *stats;
+  T h_stats[2];  // mean, var
+  int num_sigma;
+};
+
+// The measured mean and standard deviation for each tested distribution are,
+// of course, statistical variables. Thus setting an appropriate testing
+// tolerance essentially requires one to set a probability of test failure. We
+// choose to set this at 3-4 x sigma, i.e., a 99.7-99.9% confidence interval,
+// so that a correct implementation fails the test only very rarely. In quick
+// experiments (using the identical distributions given by NumPy/SciPy), the
+// measured standard deviation is the variable with the greatest variance, so
+// we determined it for each distribution and number of samples (32*1024 or
+// 8*1024). The measured standard deviations are listed below.
+
+// Distribution: StdDev 32*1024, StdDev 8*1024
+// Normal: 0.0055, 0.011
+// LogNormal: 0.05, 0.1
+// Uniform: 0.003, 0.005
+// Gumbel: 0.005, 0.01
+// Logistic: 0.005, 0.01
+// Exp: 0.008, 0.015
+// Rayleigh: 0.0125, 0.025
+// Laplace: 0.02, 0.04
+
+// We generally want 4 x sigma >= 99.9% chance of success
+
+typedef RngTest<float> RngTestF;
+const std::vector<RngInputs<float>> inputsf = {
+  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenPhilox, 1234ULL},
+  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenPhilox, 1234ULL},
+  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenPhilox, 1234ULL},
+  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenPhilox, 1234ULL},
+  {0.003, 32 * 1024, -1.f, 1.f, RNG_Uniform, GenPhilox, 1234ULL},
+  {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenPhilox, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenPhilox, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenPhilox, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenPhilox, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenPhilox, 1234ULL},
+  {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenPhilox, 1234ULL},
+  {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenPhilox, 1234ULL},
+  {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenPhilox, 1234ULL},
+  {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenPhilox, 1234ULL},
+  {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenPhilox, 1234ULL},
+  {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenPhilox, 1234ULL},
+
+  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenTaps, 1234ULL},
+  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenTaps, 1234ULL},
+  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenTaps, 1234ULL},
+  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenTaps, 1234ULL},
+  {0.003, 32 * 1024, -1.f, 1.f, RNG_Uniform, GenTaps, 1234ULL},
+  {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenTaps, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenTaps, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenTaps, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenTaps, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenTaps, 1234ULL},
+  {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenTaps, 1234ULL},
+  {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenTaps, 1234ULL},
+  {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenTaps, 1234ULL},
+  {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenTaps, 1234ULL},
+  {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenTaps, 1234ULL},
+  {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenTaps, 1234ULL},
+
+  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenKiss99, 1234ULL},
+  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenKiss99, 1234ULL},
+  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenKiss99, 1234ULL},
+  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenKiss99, 1234ULL},
+  {0.003, 32 * 1024,
-1.f, 1.f, RNG_Uniform, GenKiss99, 1234ULL}, + {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenKiss99, 1234ULL}, + {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenKiss99, 1234ULL}, + {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenKiss99, 1234ULL}, + {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenKiss99, 1234ULL}, + {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenKiss99, 1234ULL}, + {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenKiss99, 1234ULL}, + {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenKiss99, 1234ULL}, + {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenKiss99, 1234ULL}, + {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenKiss99, 1234ULL}, + {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}, + {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}}; + +TEST_P(RngTestF, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf)); + +typedef RngTest RngTestD; +const std::vector> inputsd = { + {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenPhilox, 1234ULL}, + {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenPhilox, 1234ULL}, + {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenPhilox, 1234ULL}, + {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenPhilox, 1234ULL}, + {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenPhilox, 1234ULL}, + {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenPhilox, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenPhilox, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenPhilox, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenPhilox, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenPhilox, 1234ULL}, + {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenPhilox, 1234ULL}, + {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenPhilox, 1234ULL}, + {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenPhilox, 1234ULL}, + {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenPhilox, 1234ULL}, + {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenPhilox, 1234ULL}, + {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenPhilox, 1234ULL}, + + {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenTaps, 1234ULL}, + {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenTaps, 1234ULL}, + {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenTaps, 1234ULL}, + {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenTaps, 1234ULL}, + {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenTaps, 1234ULL}, + {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenTaps, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenTaps, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenTaps, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenTaps, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenTaps, 1234ULL}, + {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenTaps, 1234ULL}, + {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenTaps, 1234ULL}, + {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenTaps, 1234ULL}, + {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenTaps, 1234ULL}, + {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenTaps, 1234ULL}, + {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenTaps, 1234ULL}, + + {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenKiss99, 1234ULL}, + {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenKiss99, 1234ULL}, + {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenKiss99, 1234ULL}, + {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenKiss99, 1234ULL}, + {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenKiss99, 1234ULL}, + {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, 
GenKiss99, 1234ULL},
+  {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenKiss99, 1234ULL},
+  {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenKiss99, 1234ULL},
+  {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenKiss99, 1234ULL},
+  {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenKiss99, 1234ULL},
+  {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenKiss99, 1234ULL},
+  {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenKiss99, 1234ULL},
+  {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
+  {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
+  {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL},
+  {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}};
+TEST_P(RngTestD, Result) {
+  double meanvar[2];
+  getExpectedMeanVar(meanvar);
+  ASSERT_TRUE(match(meanvar[0], h_stats[0],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
+
+// ---------------------------------------------------------------------- //
+// Test for expected variance in mean calculations
+
+template <typename T>
+T quick_mean(const std::vector<T>& d) {
+  T acc = T(0);
+  for (const auto& di : d) {
+    acc += di;
+  }
+  return acc / d.size();
+}
+
+template <typename T>
+T quick_std(const std::vector<T>& d) {
+  T acc = T(0);
+  T d_mean = quick_mean(d);
+  for (const auto& di : d) {
+    acc += ((di - d_mean) * (di - d_mean));
+  }
+  return std::sqrt(acc / (d.size() - 1));
+}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
+  if (!v.empty()) {
+    out << '[';
+    std::copy(v.begin(), v.end(), std::ostream_iterator<T>(out, ", "));
+    out << "\b\b]";
+  }
+  return out;
+}
+
+// The following tests the 3 random number generators by checking that the
+// measured mean error is close to the well-known analytical result
+// (sigma / sqrt(n_samples)). To compute the mean error, we run a number of
+// experiments, each of which computes the mean of its samples; this gives us
+// a distribution of the mean itself. The mean error is simply the standard
+// deviation of this distribution (the standard deviation of the mean).
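+// As a concrete arithmetic check under the parameters used below
+// (num_samples = 1024, normal with sigma = 0.23): the analytical mean error
+// is sigma / sqrt(n) = 0.23 / 32 ~= 0.0072, and the assertion only requires
+// the measured value to lie within 50% of that.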
+TEST(Rng, MeanError) { + timeb time_struct; + ftime(&time_struct); + int seed = time_struct.millitm; + int num_samples = 1024; + int num_experiments = 1024; + float* data; + float* mean_result; + float* std_result; + int len = num_samples * num_experiments; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + allocate(data, len); + allocate(mean_result, num_experiments); + allocate(std_result, num_experiments); + + for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) { + Rng r(seed, rtype); + r.normal(data, len, 3.3f, 0.23f, stream); + // r.uniform(data, len, -1.0, 2.0); + raft::stats::mean(mean_result, data, num_samples, num_experiments, false, + false, stream); + raft::stats::stddev(std_result, data, mean_result, num_samples, + num_experiments, false, false, stream); + std::vector h_mean_result(num_experiments); + std::vector h_std_result(num_experiments); + update_host(h_mean_result.data(), mean_result, num_experiments, stream); + update_host(h_std_result.data(), std_result, num_experiments, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + auto d_mean = quick_mean(h_mean_result); + + // std-dev of mean; also known as mean error + auto d_std_of_mean = quick_std(h_mean_result); + auto d_std = quick_mean(h_std_result); + auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples); + + // std::cout << "measured mean error: " << d_std_of_mean << "\n"; + // std::cout << "expected mean error: " << d_std/std::sqrt(num_samples) << "\n"; + + auto diff_expected_vs_measured_mean_error = + std::abs(d_std_of_mean - d_std / std::sqrt(num_samples)); + + ASSERT_TRUE( + (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); + } + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(mean_result)); + CUDA_CHECK(cudaFree(std_result)); + + // std::cout << "mean_res:" << h_mean_result << "\n"; +} + +template +class ScaledBernoulliTest : public ::testing::Test { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + + Rng r(42); + + allocate(data, len * sizeof(T), stream); + r.scaled_bernoulli(data, len, T(0.5), T(scale), stream); + } + + void TearDown() override { CUDA_CHECK(cudaFree(data)); } + + void rangeCheck() { + T* h_data = new T[len]; + update_host(h_data, data, len, stream); + ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { + return a < -scale || a > scale; + })); + delete[] h_data; + } + + T* data; + cudaStream_t stream; +}; + +typedef ScaledBernoulliTest ScaledBernoulliTest1; +TEST_F(ScaledBernoulliTest1, RangeCheck) { rangeCheck(); } + +typedef ScaledBernoulliTest ScaledBernoulliTest2; +TEST_F(ScaledBernoulliTest2, RangeCheck) { rangeCheck(); } + +template +class BernoulliTest : public ::testing::Test { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + Rng r(42); + allocate(data, len * sizeof(bool), stream); + r.bernoulli(data, len, T(0.5), stream); + } + + void TearDown() override { CUDA_CHECK(cudaFree(data)); } + + void trueFalseCheck() { + // both true and false values must be present + bool* h_data = new bool[len]; + update_host(h_data, data, len, stream); + ASSERT_TRUE(std::any_of(h_data, h_data + len, [](bool a) { return a; })); + ASSERT_TRUE(std::any_of(h_data, h_data + len, [](bool a) { return !a; })); + delete[] h_data; + } + + bool* data; + cudaStream_t stream; +}; + +typedef BernoulliTest BernoulliTest1; +TEST_F(BernoulliTest1, TrueFalseCheck) { trueFalseCheck(); } + +typedef BernoulliTest BernoulliTest2; 
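+// as with the float fixture above, only the presence of both outcomes is
+// checked here, not the p = 0.5 split itself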
+TEST_F(BernoulliTest2, TrueFalseCheck) { trueFalseCheck(); } + +/** Rng::normalTable tests */ +template +struct RngNormalTableInputs { + T tolerance; + int rows, cols; + T mu, sigma; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, + const RngNormalTableInputs& dims) { + return os; +} + +template +class RngNormalTableTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + // Tests are configured with their expected test-values sigma. For example, + // 4 x sigma indicates the test shouldn't fail 99.9% of the time. + num_sigma = 10; + params = ::testing::TestWithParam>::GetParam(); + int len = params.rows * params.cols; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + Rng r(params.seed, params.gtype); + allocate(data, len); + allocate(stats, 2, true); + allocate(mu_vec, params.cols); + r.fill(mu_vec, params.cols, params.mu, stream); + T* sigma_vec = nullptr; + r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, + params.sigma, stream); + static const int threads = 128; + meanKernel + <<>>(stats, data, len); + update_host(h_stats, stats, 2, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + h_stats[0] /= len; + h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(stats)); + CUDA_CHECK(cudaFree(mu_vec)); + } + + void getExpectedMeanVar(T meanvar[2]) { + meanvar[0] = params.mu; + meanvar[1] = params.sigma * params.sigma; + } + + protected: + RngNormalTableInputs params; + T *data, *stats, *mu_vec; + T h_stats[2]; // mean, var + int num_sigma; +}; + +typedef RngNormalTableTest RngNormalTableTestF; +const std::vector> inputsf_t = { + {0.0055, 32, 1024, 1.f, 1.f, GenPhilox, 1234ULL}, + {0.011, 8, 1024, 1.f, 1.f, GenPhilox, 1234ULL}, + {0.0055, 32, 1024, 1.f, 1.f, GenTaps, 1234ULL}, + {0.011, 8, 1024, 1.f, 1.f, GenTaps, 1234ULL}, + {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL}, + {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}}; + +TEST_P(RngNormalTableTestF, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, + ::testing::ValuesIn(inputsf_t)); + +typedef RngNormalTableTest RngNormalTableTestD; +const std::vector> inputsd_t = { + {0.0055, 32, 1024, 1.0, 1.0, GenPhilox, 1234ULL}, + {0.011, 8, 1024, 1.0, 1.0, GenPhilox, 1234ULL}, + {0.0055, 32, 1024, 1.0, 1.0, GenTaps, 1234ULL}, + {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL}, + {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL}, + {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}}; +TEST_P(RngNormalTableTestD, Result) { + double meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, + ::testing::ValuesIn(inputsd_t)); + +struct RngAffineInputs { + int n; + unsigned long long seed; +}; + +class RngAffineTest : public ::testing::TestWithParam { + protected: + void SetUp() override { + params = ::testing::TestWithParam::GetParam(); + Rng r(params.seed); + 
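+    // affine_transform_params() draws the constants a and b of an affine
+    // index transform (presumably idx -> (a * idx + b) % n); check() below
+    // verifies that a is coprime with n, which makes the map a bijection,
+    // and that 0 <= b < n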
r.affine_transform_params(params.n, a, b); + } + + void check() { + ASSERT_TRUE(gcd(a, params.n) == 1); + ASSERT_TRUE(0 <= b && b < params.n); + } + + private: + RngAffineInputs params; + int a, b; +}; // RngAffineTest + +const std::vector inputs_affine = { + {100, 123456ULL}, {100, 1234567890ULL}, {101, 123456ULL}, + {101, 1234567890ULL}, {7, 123456ULL}, {7, 1234567890ULL}, + {2568, 123456ULL}, {2568, 1234567890ULL}, +}; +TEST_P(RngAffineTest, Result) { check(); } +INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, + ::testing::ValuesIn(inputs_affine)); + +} // namespace random +} // namespace raft diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu new file mode 100644 index 0000000000..92f12206e8 --- /dev/null +++ b/cpp/test/random/rng_int.cu @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace random { + +enum RandomType { RNG_Uniform }; + +template +__global__ void meanKernel(float *out, const T *data, int len) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + float val = tid < len ? 
data[tid] : T(0); + float x = BlockReduce(temp_storage).Sum(val); + __syncthreads(); + float xx = BlockReduce(temp_storage).Sum(val * val); + __syncthreads(); + if (threadIdx.x == 0) { + raft::myAtomicAdd(out, x); + raft::myAtomicAdd(out + 1, xx); + } +} + +template +struct RngInputs { + float tolerance; + int len; + // start, end: for uniform + // mean, sigma: for normal/lognormal + // mean, beta: for gumbel + // mean, scale: for logistic and laplace + // lambda: for exponential + // sigma: for rayleigh + T start, end; + RandomType type; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const RngInputs &dims) { + return os; +} + +template +class RngTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + Rng r(params.seed, params.gtype); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(data, params.len); + allocate(stats, 2, true); + switch (params.type) { + case RNG_Uniform: + r.uniformInt(data, params.len, params.start, params.end, stream); + break; + }; + static const int threads = 128; + meanKernel + <<>>(stats, data, + params.len); + update_host(h_stats, stats, 2, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + h_stats[0] /= params.len; + h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(stats)); + } + + void getExpectedMeanVar(float meanvar[2]) { + switch (params.type) { + case RNG_Uniform: + meanvar[0] = (params.start + params.end) * 0.5f; + meanvar[1] = params.end - params.start; + meanvar[1] = meanvar[1] * meanvar[1] / 12.f; + break; + }; + } + + protected: + RngInputs params; + T *data; + float *stats; + float h_stats[2]; // mean, var +}; + +typedef RngTest RngTestU32; +const std::vector> inputs_u32 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestU32, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32)); + +typedef RngTest RngTestU64; +const std::vector> inputs_u64 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestU64, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64)); + +typedef RngTest RngTestS32; +const std::vector> inputs_s32 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 
1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestS32, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32)); + +typedef RngTest RngTestS64; +const std::vector> inputs_s64 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestS64, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64)); + +} // namespace random +} // namespace raft diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu new file mode 100644 index 0000000000..d7e52a8958 --- /dev/null +++ b/cpp/test/random/sample_without_replacement.cu @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace random { + +// Terminology: +// SWoR - Sample Without Replacement + +template +struct SWoRInputs { + int len, sampledLen; + int largeWeightIndex; + T largeWeight; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) { + return os; +} + +template +class SWoRTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + CUDA_CHECK(cudaStreamCreate(&stream)); + + Rng r(params.seed, params.gtype); + allocate(in, params.len); + allocate(wts, params.len); + allocate(out, params.sampledLen); + allocate(outIdx, params.sampledLen); + h_outIdx.resize(params.sampledLen); + r.uniform(in, params.len, T(-1.0), T(1.0), stream); + r.uniform(wts, params.len, T(1.0), T(2.0), stream); + if (params.largeWeightIndex >= 0) { + update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, + stream); + } + r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, + params.len, stream); + update_host(&(h_outIdx[0]), outIdx, params.sampledLen, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(wts)); + CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaFree(outIdx)); + } + + protected: + SWoRInputs params; + T *in, *out, *wts; + int* outIdx; + std::vector h_outIdx; + cudaStream_t stream; + raft::handle_t handle; +}; + +typedef SWoRTest SWoRTestF; +const std::vector> inputsf = { + {1024, 512, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 1024, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 512 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 1024 - 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 512 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 512, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 1024, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 512, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 1024, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 512, 10, 100000.f, GenPhilox, 1234ULL}, + + {1024, 512, -1, 0.f, GenTaps, 1234ULL}, + {1024, 1024, -1, 0.f, GenTaps, 1234ULL}, + {1024, 512 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024, 1024 - 1, -1, 0.f, GenTaps, 1234ULL}, + {1024, 512 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024, 1024 - 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 512, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 1024, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 512, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 1024, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024, 512, 10, 100000.f, GenTaps, 
1234ULL}, + + {1024, 512, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 1024, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 512 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 1024 - 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 512 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 1024 - 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 512, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 1024, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 512, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 1024, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 512, 10, 100000.f, GenKiss99, 1234ULL}, +}; + +TEST_P(SWoRTestF, Result) { + std::set occurence; + for (int i = 0; i < params.sampledLen; ++i) { + auto val = h_outIdx[i]; + // indices must be in the given range + ASSERT_TRUE(0 <= val && val < params.len) + << "out-of-range index @i=" << i << " val=" << val + << " sampledLen=" << params.sampledLen; + // indices should not repeat + ASSERT_TRUE(occurence.find(val) == occurence.end()) + << "repeated index @i=" << i << " idx=" << val; + occurence.insert(val); + } + // if there's a skewed distribution, the top index should correspond to the + // particular item with a large weight + if (params.largeWeightIndex >= 0) { + ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); + } +} +INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf)); + +typedef SWoRTest SWoRTestD; +const std::vector> inputsd = { + {1024, 512, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 1024, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 512 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 1024 - 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 512 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 512, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 1024, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 512, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 1024, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 512, 10, 100000.0, GenPhilox, 1234ULL}, + + {1024, 512, -1, 0.0, GenTaps, 1234ULL}, + {1024, 1024, -1, 0.0, GenTaps, 1234ULL}, + {1024, 512 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024, 1024 - 1, -1, 0.0, GenTaps, 1234ULL}, + {1024, 512 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024, 1024 - 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 512, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 1024, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 512, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 1024, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 
+ 2, 1024 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024, 512, 10, 100000.0, GenTaps, 1234ULL}, + + {1024, 512, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 1024, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 512 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 1024 - 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 512 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 1024 - 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 512, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 1024, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 512, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 1024, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 512, 10, 100000.0, GenKiss99, 1234ULL}, +}; + +TEST_P(SWoRTestD, Result) { + std::set occurence; + for (int i = 0; i < params.sampledLen; ++i) { + auto val = h_outIdx[i]; + // indices must be in the given range + ASSERT_TRUE(0 <= val && val < params.len) + << "out-of-range index @i=" << i << " val=" << val + << " sampledLen=" << params.sampledLen; + // indices should not repeat + ASSERT_TRUE(occurence.find(val) == occurence.end()) + << "repeated index @i=" << i << " idx=" << val; + occurence.insert(val); + } + // if there's a skewed distribution, the top index should correspond to the + // particular item with a large weight + if (params.largeWeightIndex >= 0) { + ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); + } +} +INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd)); + +} // namespace random +} // namespace raft diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu new file mode 100644 index 0000000000..4a3b0ed196 --- /dev/null +++ b/cpp/test/stats/mean.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace stats { + +template +struct MeanInputs { + T tolerance, mean; + int rows, cols; + bool sample, rowMajor; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const MeanInputs &dims) { + return os; +} + +template +class MeanTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + + int rows = params.rows, cols = params.cols; + int len = rows * cols; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + allocate(data, len); + allocate(mean_act, cols); + r.normal(data, len, params.mean, (T)1.0, stream); + + meanSGtest(data, stream); + } + + void meanSGtest(T *data, cudaStream_t stream) { + int rows = params.rows, cols = params.cols; + + mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(mean_act)); + } + + protected: + MeanInputs params; + T *data, *mean_act; +}; + +// Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the +// measured mean (of a normal distribution) will fall outside of an epsilon of +// 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) +const std::vector> inputsf = { + {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; + +const std::vector> inputsd = { + {0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; + +typedef MeanTest MeanTestF; +TEST_P(MeanTestF, Result) { + ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, + CompareApprox(params.tolerance))); +} + +typedef MeanTest MeanTestD; +TEST_P(MeanTestD, Result) { + ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, + CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace stats +} 
// end namespace raft diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu new file mode 100644 index 0000000000..8b0d607561 --- /dev/null +++ b/cpp/test/stats/mean_center.cu @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../linalg/matrix_vector_op.cuh" +#include "../test_utils.h" + +namespace raft { +namespace stats { + +template +struct MeanCenterInputs { + T tolerance, mean; + IdxType rows, cols; + bool sample, rowMajor, bcastAlongRows; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const MeanCenterInputs &dims) { + return os; +} + +template +class MeanCenterTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + auto rows = params.rows, cols = params.cols; + auto len = rows * cols; + IdxType vecLen = params.bcastAlongRows ? cols : rows; + + raft::allocate(out, len); + raft::allocate(out_ref, len); + raft::allocate(data, len); + raft::allocate(meanVec, vecLen); + r.normal(data, len, params.mean, (T)1.0, stream); + raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, + stream); + meanCenter(out, data, meanVec, cols, rows, params.rowMajor, + params.bcastAlongRows, stream); + raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, + params.rowMajor, params.bcastAlongRows, (T)-1.0); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(meanVec)); + } + + protected: + MeanCenterInputs params; + T *data, *meanVec, *out, *out_ref; +}; + +const std::vector> inputsf_i32 = { + {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, 
true, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest MeanCenterTestF_i32; +TEST_P(MeanCenterTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest MeanCenterTestF_i64; +TEST_P(MeanCenterTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsd_i32 = { + {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, false, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest 
MeanCenterTestD_i32; +TEST_P(MeanCenterTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, false, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest MeanCenterTestD_i64; +TEST_P(MeanCenterTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // end namespace stats +} // end namespace raft diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu new file mode 100644 index 0000000000..ff2698788f --- /dev/null +++ b/cpp/test/stats/stddev.cu @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
new file mode 100644
index 0000000000..ff2698788f
--- /dev/null
+++ b/cpp/test/stats/stddev.cu
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/matrix/math.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/stats/mean.cuh>
+#include <raft/stats/stddev.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct StdDevInputs {
+  T tolerance, mean, stddev;
+  int rows, cols;
+  bool sample, rowMajor;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const StdDevInputs<T> &dims) {
+  return os << "{rows: " << dims.rows << ", cols: " << dims.cols
+            << ", sample: " << dims.sample << ", rowMajor: " << dims.rowMajor
+            << "}";
+}
+
+template <typename T>
+class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<StdDevInputs<T>>::GetParam();
+    random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols;
+    int len = rows * cols;
+
+    cudaStream_t stream;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    allocate(data, len);
+    allocate(mean_act, cols);
+    allocate(stddev_act, cols);
+    allocate(vars_act, cols);
+    r.normal(data, len, params.mean, params.stddev, stream);
+    stdVarSGtest(data, stream);
+    // make sure all work enqueued on this stream has finished before the
+    // stream is destroyed and the results are read on another stream
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void stdVarSGtest(T *data, cudaStream_t stream) {
+    int rows = params.rows, cols = params.cols;
+
+    mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream);
+
+    stddev(stddev_act, data, mean_act, cols, rows, params.sample,
+           params.rowMajor, stream);
+
+    vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor,
+         stream);
+
+    // vars_act holds variances; take the square root in place so it can be
+    // compared element-wise against the directly computed stddev_act
+    raft::matrix::seqRoot(vars_act, T(1), cols, stream);
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(mean_act));
+    CUDA_CHECK(cudaFree(stddev_act));
+    CUDA_CHECK(cudaFree(vars_act));
+  }
+
+ protected:
+  StdDevInputs<T> params;
+  T *data, *mean_act, *stddev_act, *vars_act;
+};
+
+const std::vector<StdDevInputs<float>> inputsf = {
+  {0.1f, 1.f, 2.f, 1024, 32, true, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 64, true, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 128, true, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 256, true, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 32, false, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 64, false, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 128, false, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 256, false, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 32, true, true, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 64, true, true, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 128, true, true, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 256, true, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 32, false, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 64, false, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 128, false, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 256, false, true, 1234ULL}};
+
+const std::vector<StdDevInputs<double>> inputsd = {
+  {0.1, 1.0, 2.0, 1024, 32, true, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 64, true, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 128, true, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 256, true, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 32, false, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 64, false, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 128, false, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 256, false, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 32, true, true, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 64, true, true, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 128, true, true, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 256, true, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 32, false, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 64, false, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 128, false, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}};
+
+typedef StdDevTest<float> StdDevTestF;
+TEST_P(StdDevTestF, Result) {
+  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
+                          CompareApprox<float>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
+                          CompareApprox<float>(params.tolerance)));
+}
+
+typedef StdDevTest<double> StdDevTestD;
+TEST_P(StdDevTestD, Result) {
+  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
+                          CompareApprox<double>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
+                          CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD,
+                         ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
new file mode 100644
index 0000000000..c3140d4588
--- /dev/null
+++ b/cpp/test/stats/sum.cu
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/stats/sum.cuh>
+#include <vector>
+#include "../test_utils.h"
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct SumInputs {
+  T tolerance;
+  int rows, cols;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const SumInputs<T> &dims) {
+  return os << "{rows: " << dims.rows << ", cols: " << dims.cols << "}";
+}
+
+template <typename T>
+class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<SumInputs<T>>::GetParam();
+    int rows = params.rows, cols = params.cols;
+    int len = rows * cols;
+    cudaStream_t stream;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(data, len);
+
+    // fill the matrix with ones on the host (a std::vector instead of a
+    // non-standard variable-length array)
+    std::vector<T> data_h(len, T(1));
+
+    raft::update_device(data, data_h.data(), len, stream);
+
+    raft::allocate(sum_act, cols);
+    sum(sum_act, data, cols, rows, false, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(sum_act));
+  }
+
+ protected:
+  SumInputs<T> params;
+  T *data, *sum_act;
+};
+
+const std::vector<SumInputs<float>> inputsf = {{0.05f, 1024, 32, 1234ULL},
+                                               {0.05f, 1024, 256, 1234ULL}};
+
+const std::vector<SumInputs<double>> inputsd = {{0.05, 1024, 32, 1234ULL},
+                                                {0.05, 1024, 256, 1234ULL}};
+
+typedef SumTest<float> SumTestF;
+TEST_P(SumTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act, params.cols,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef SumTest<double> SumTestD;
+TEST_P(SumTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act, params.cols,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(SumTests, SumTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
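Reviewer note: the expectation in `SumTestF`/`SumTestD` follows directly from the fixture filling the matrix with ones, so every one of the `cols` column sums is exactly `rows` and no reference kernel is needed. A minimal standalone sketch of driving the same prim (hypothetical buffer names; assumes the `sum`, `allocate`, and `update_device` signatures the fixture above calls):

    int rows = 1024, cols = 32;
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    float *d_in, *d_out;
    raft::allocate(d_in, rows * cols);  // column-major rows x cols matrix
    raft::allocate(d_out, cols);        // one sum per column
    std::vector<float> ones(rows * cols, 1.f);
    raft::update_device(d_in, ones.data(), rows * cols, stream);
    raft::stats::sum(d_out, d_in, cols, rows, false, stream);  // column sums
    ASSERT_TRUE(raft::devArrMatch(float(rows), d_out, cols,
                                  raft::CompareApprox<float>(1e-4f)));
    CUDA_CHECK(cudaStreamDestroy(stream));
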
diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h
new file mode 100644
index 0000000000..1629e8aa34
--- /dev/null
+++ b/cpp/test/test_utils.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <raft/cudart_utils.h>
+#include <raft/cuda_utils.cuh>
+
+namespace raft {
+
+template <typename T>
+struct Compare {
+  bool operator()(const T &a, const T &b) const { return a == b; }
+};
+
+// generic absolute value, defined before the comparators below so that their
+// unqualified abs() calls resolve to this overload
+template <typename T>
+T abs(const T &a) {
+  return a > T(0) ? a : -a;
+}
+
+template <typename T>
+struct CompareApprox {
+  CompareApprox(T eps_) : eps(eps_) {}
+  bool operator()(const T &a, const T &b) const {
+    T diff = abs(a - b);
+    T m = std::max(abs(a), abs(b));
+    T ratio = diff >= eps ? diff / m : diff;
+
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;
+};
+
+template <typename T>
+struct CompareApproxAbs {
+  CompareApproxAbs(T eps_) : eps(eps_) {}
+  bool operator()(const T &a, const T &b) const {
+    T diff = abs(abs(a) - abs(b));
+    T m = std::max(abs(a), abs(b));
+    T ratio = diff >= eps ? diff / m : diff;
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;
+};
+
+/**
+ * @brief Helper function to compare 2 device n-D arrays with custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value(s)
+ * @param actual actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ * @{
+ */
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(const T *expected, const T *actual,
+                                     size_t size, L eq_compare,
+                                     cudaStream_t stream = 0) {
+  // unique_ptr<T[]> releases the staging buffers with delete[]
+  std::unique_ptr<T[]> exp_h(new T[size]);
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(exp_h.get(), expected, size, stream);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < size; ++i) {
+    auto exp = exp_h.get()[i];
+    auto act = act_h.get()[i];
+    if (!eq_compare(exp, act)) {
+      return testing::AssertionFailure()
+             << "actual=" << act << " != expected=" << exp << " @" << i;
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size,
+                                     L eq_compare, cudaStream_t stream = 0) {
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < size; ++i) {
+    auto act = act_h.get()[i];
+    if (!eq_compare(expected, act)) {
+      return testing::AssertionFailure()
+             << "actual=" << act << " != expected=" << expected << " @" << i;
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
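Reviewer note: `CompareApprox` only divides by the larger magnitude when the raw difference already exceeds `eps`; otherwise it compares the difference absolutely, which keeps near-zero expected values from being rejected by a purely relative test. A quick host-side illustration (values invented for the walkthrough):

    raft::CompareApprox<float> approx(0.05f);  // eps = 5%
    EXPECT_TRUE(approx(100.f, 104.f));   // diff=4 >= eps: 4/104 ~ 0.038 <= 0.05
    EXPECT_FALSE(approx(100.f, 106.f));  // relative: 6/106 ~ 0.057 > 0.05
    EXPECT_TRUE(approx(0.f, 0.01f));     // diff=0.01 < eps: compared absolutely
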
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(const T *expected, const T *actual,
+                                     size_t rows, size_t cols, L eq_compare,
+                                     cudaStream_t stream = 0) {
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> exp_h(new T[size]);
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(exp_h.get(), expected, size, stream);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < rows; ++i) {
+    for (size_t j(0); j < cols; ++j) {
+      auto idx = i * cols + j;  // row major assumption!
+      auto exp = exp_h.get()[idx];
+      auto act = act_h.get()[idx];
+      if (!eq_compare(exp, act)) {
+        return testing::AssertionFailure()
+               << "actual=" << act << " != expected=" << exp << " @" << i << ","
+               << j;
+      }
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
+                                     size_t cols, L eq_compare,
+                                     cudaStream_t stream = 0) {
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < rows; ++i) {
+    for (size_t j(0); j < cols; ++j) {
+      auto idx = i * cols + j;  // row major assumption!
+      auto act = act_h.get()[idx];
+      if (!eq_compare(expected, act)) {
+        return testing::AssertionFailure()
+               << "actual=" << act << " != expected=" << expected << " @" << i
+               << "," << j;
+      }
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+/**
+ * @brief Helper function to compare a device n-D array with an expected array
+ * on the host, using a custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected_h host array of expected value(s)
+ * @param actual_d device array of actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
+template <typename T, typename L>
+testing::AssertionResult devArrMatchHost(const T *expected_h,
+                                         const T *actual_d, size_t size,
+                                         L eq_compare,
+                                         cudaStream_t stream = 0) {
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual_d, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  bool ok = true;
+  auto fail = testing::AssertionFailure();
+  for (size_t i(0); i < size; ++i) {
+    auto exp = expected_h[i];
+    auto act = act_h.get()[i];
+    if (!eq_compare(exp, act)) {
+      ok = false;
+      fail << "actual=" << act << " != expected=" << exp << " @" << i << "; ";
+    }
+  }
+  if (!ok) return fail;
+  return testing::AssertionSuccess();
+}
+
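Reviewer note: `devArrMatchHost` is the variant to reach for when the reference is computed on the CPU, and unlike the overloads above it accumulates every mismatching index into a single failure message instead of stopping at the first. A usage sketch with hypothetical names (`d_result`, `n`):

    #include <numeric>  // std::iota
    std::vector<float> ref(n);               // CPU-computed reference
    std::iota(ref.begin(), ref.end(), 0.f);  // e.g. 0, 1, 2, ...
    ASSERT_TRUE(raft::devArrMatchHost(ref.data(), d_result, n,
                                      raft::CompareApprox<float>(1e-4f)));
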
+/**
+ * @brief Helper function to compare diagonal values of a 2D matrix
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value along diagonal
+ * @param actual actual matrix
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
+template <typename T, typename L>
+testing::AssertionResult diagonalMatch(T expected, const T *actual,
+                                       size_t rows, size_t cols, L eq_compare,
+                                       cudaStream_t stream = 0) {
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < rows; ++i) {
+    for (size_t j(0); j < cols; ++j) {
+      if (i != j) continue;
+      auto idx = i * cols + j;  // row major assumption!
+      auto act = act_h.get()[idx];
+      if (!eq_compare(expected, act)) {
+        return testing::AssertionFailure()
+               << "actual=" << act << " != expected=" << expected << " @" << i
+               << "," << j;
+      }
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult match(const T expected, T actual, L eq_compare) {
+  if (!eq_compare(expected, actual)) {
+    return testing::AssertionFailure()
+           << "actual=" << actual << " != expected=" << expected;
+  }
+  return testing::AssertionSuccess();
+}
+
+/** @} */
+
+/** time the function call 'func' using cuda events */
+#define TIMEIT_LOOP(ms, count, func)                    \
+  do {                                                  \
+    cudaEvent_t start, stop;                            \
+    CUDA_CHECK(cudaEventCreate(&start));                \
+    CUDA_CHECK(cudaEventCreate(&stop));                 \
+    CUDA_CHECK(cudaEventRecord(start));                 \
+    for (int i = 0; i < count; ++i) {                   \
+      func;                                             \
+    }                                                   \
+    CUDA_CHECK(cudaEventRecord(stop));                  \
+    CUDA_CHECK(cudaEventSynchronize(stop));             \
+    ms = 0.f;                                           \
+    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop)); \
+    ms /= count; /* average time per call */            \
+    CUDA_CHECK(cudaEventDestroy(start));                \
+    CUDA_CHECK(cudaEventDestroy(stop));                 \
+  } while (0)
+
+}  // end namespace raft
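Reviewer note: to close, a sketch of `TIMEIT_LOOP` in use (hypothetical `d_out`/`d_in` buffers; the events are recorded on the default stream, so the timed call should submit its work there). `ms` comes back as the average time per launch over `count` iterations:

    float ms = 0.f;
    constexpr int count = 100;
    TIMEIT_LOOP(ms, count, raft::stats::sum(d_out, d_in, cols, rows, false, 0));
    std::cout << "sum(): " << ms << " ms per call" << std::endl;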