diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index f380d276b2..08f836d3a8 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -21,7 +21,8 @@ namespace raft { /* Function for testing RAFT include * * @return message indicating RAFT has been included succesfully*/ -inline std::string test_raft() { +inline std::string test_raft() +{ std::string status = "RAFT Setup succesfully"; return status; } diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index ce8ef9a095..f63040fa00 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -42,17 +42,15 @@ namespace cache { * @param [out] out vectors collected from the cache, size [n_vec * n] */ template -__global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx, - int n, math_t *out) { +__global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, int n, math_t* out) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - size_t out_col = tid / n_vec; // col idx + size_t out_col = tid / n_vec; // col idx size_t cache_col = cache_idx[out_col]; if (cache_idx[out_col] >= 0) { - if (row + out_col * n_vec < (size_t)n_vec * n) { - out[tid] = cache[row + cache_col * n_vec]; - } + if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; } } } } @@ -84,21 +82,26 @@ __global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx, * @param [in] n_cache_vecs */ template -__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec, - const int *tile_idx, int n, const int *cache_idx, - math_t *cache, int n_cache_vecs) { +__global__ void store_vecs(const math_t* tile, + int n_tile, + int n_vec, + const int* tile_idx, + int n, + const int* cache_idx, + math_t* cache, + int n_cache_vecs) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - int tile_col = tid / n_vec; // col idx - int data_col = tile_idx ? tile_idx[tile_col] : tile_col; + int tile_col = tid / n_vec; // col idx + int data_col = tile_idx ? tile_idx[tile_col] : tile_col; int cache_col = cache_idx[tile_col]; // We ignore negative values. The rest of the checks should be fulfilled // if the cache is used properly if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) { - cache[row + (size_t)cache_col * n_vec] = - tile[row + (size_t)data_col * n_vec]; + cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec]; } } } @@ -121,14 +124,15 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; } * @return the index of the first element in the array for which * array[idx] >= value. If there is no such value, then return n. */ -int DI arg_first_ge(const int *array, int n, int val) { +int DI arg_first_ge(const int* array, int n, int val) +{ int start = 0; - int end = n - 1; + int end = n - 1; if (array[0] == val) return 0; if (array[end] < val) return n; while (start + 1 < end) { int q = (start + end + 1) / 2; - //invariants: + // invariants: // start < end // start < q <=end // array[start] < val && array[end] <=val @@ -157,7 +161,8 @@ int DI arg_first_ge(const int *array, int n, int val) { * @return the idx of the k-th occurance of val in array, or -1 if * the value is not found. */ -int DI find_nth_occurrence(const int *array, int n, int val, int k) { +int DI find_nth_occurrence(const int* array, int n, int val, int k) +{ int q = arg_first_ge(array, n, val); if (q + k < n && array[q + k] == val) { q += k; @@ -196,10 +201,10 @@ int DI find_nth_occurrence(const int *array, int n, int val, int k) { * Each block should give a different pointer for rank. */ template -DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { +DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) +{ const int items_per_thread = raft::ceildiv(associativity, nthreads); - typedef cub::BlockRadixSort - BlockRadixSort; + typedef cub::BlockRadixSort BlockRadixSort; __shared__ typename BlockRadixSort::TempStorage temp_storage; int key[items_per_thread]; @@ -208,8 +213,8 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { int block_offset = blockIdx.x * associativity; for (int j = 0; j < items_per_thread; j++) { - int k = threadIdx.x + j * nthreads; - int t = (k < associativity) ? cache_time[block_offset + k] : 32768; + int k = threadIdx.x + j * nthreads; + int t = (k < associativity) ? cache_time[block_offset + k] : 32768; key[j] = t; val[j] = k; } @@ -217,9 +222,7 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { BlockRadixSort(temp_storage).Sort(key, val); for (int j = 0; j < items_per_thread; j++) { - if (val[j] < associativity) { - rank[val[j]] = threadIdx.x * items_per_thread + j; - } + if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; } } __syncthreads(); } @@ -252,9 +255,15 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { * not be cached, size [n] */ template -__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, - int *cached_keys, int n_cache_sets, - int *cache_time, int time, int *cache_idx) { +__global__ void assign_cache_idx(const int* keys, + int n, + const int* cache_set, + int* cached_keys, + int n_cache_sets, + int* cache_time, + int time, + int* cache_idx) +{ int block_offset = blockIdx.x * associativity; const int items_per_thread = raft::ceildiv(associativity, nthreads); @@ -273,7 +282,7 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, // these elements are assigned -1. for (int j = 0; j < items_per_thread; j++) { - int i = threadIdx.x + j * nthreads; + int i = threadIdx.x + j * nthreads; int t_idx = block_offset + i; bool mask = (i < associativity); // whether this slot is available for writing @@ -284,10 +293,10 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, if (mask) { int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]); if (k > -1) { - int key_val = keys[k]; + int key_val = keys[k]; cached_keys[t_idx] = key_val; - cache_idx[k] = t_idx; - cache_time[t_idx] = time; + cache_idx[k] = t_idx; + cache_time[t_idx] = time; } } } @@ -315,21 +324,28 @@ namespace { * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity] * @param [in] n_cache_sets number of cache sets * @param [in] associativity number of keys in cache set - * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity] + * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * + * associativity] * @param [out] cache_idx cache indices of the working set elements, size [n] * @param [out] is_cached whether the element is cached size[n] * @param [in] time iteration counter (used for time stamping) */ -__global__ void get_cache_idx(int *keys, int n, int *cached_keys, - int n_cache_sets, int associativity, - int *cache_time, int *cache_idx, bool *is_cached, - int time) { +__global__ void get_cache_idx(int* keys, + int n, + int* cached_keys, + int n_cache_sets, + int associativity, + int* cache_time, + int* cache_idx, + bool* is_cached, + int time) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) { - int widx = keys[tid]; - int sidx = hash(widx, n_cache_sets); - int cidx = sidx * associativity; - int i = 0; + int widx = keys[tid]; + int sidx = hash(widx, n_cache_sets); + int cidx = sidx * associativity; + int i = 0; bool found = false; // search for empty spot and the least recently used spot while (i < associativity && !found) { @@ -338,9 +354,9 @@ __global__ void get_cache_idx(int *keys, int n, int *cached_keys, } is_cached[tid] = found; if (found) { - cidx = cidx + i - 1; - cache_time[cidx] = time; //update time stamp - cache_idx[tid] = cidx; //exact cache idx + cidx = cidx + i - 1; + cache_time[cidx] = time; // update time stamp + cache_idx[tid] = cidx; // exact cache idx } else { cache_idx[tid] = sidx; // assign cache set } diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 8d5b29f700..4767c7f254 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -22,28 +22,32 @@ namespace raft { /** - * @brief Convenience wrapper over cub's SortPairs method - * @tparam KeyT key type - * @tparam ValueT value type - * @param workspace workspace buffer which will get resized if not enough space - * @param inKeys input keys array - * @param outKeys output keys array - * @param inVals input values array - * @param outVals output values array - * @param len array length - * @param stream cuda stream - */ + * @brief Convenience wrapper over cub's SortPairs method + * @tparam KeyT key type + * @tparam ValueT value type + * @param workspace workspace buffer which will get resized if not enough space + * @param inKeys input keys array + * @param outKeys output keys array + * @param inVals input values array + * @param outVals output values array + * @param len array length + * @param stream cuda stream + */ template -void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, - KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, - cudaStream_t stream) { +void sortPairs(raft::mr::device::buffer& workspace, + const KeyT* inKeys, + KeyT* outKeys, + const ValueT* inVals, + ValueT* outVals, + int len, + cudaStream_t stream) +{ size_t worksize; - cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals, - outVals, len, 0, sizeof(KeyT) * 8, stream); + cub::DeviceRadixSort::SortPairs( + nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); workspace.resize(worksize, stream); - cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys, - inVals, outVals, len, 0, sizeof(KeyT) * 8, - stream); + cub::DeviceRadixSort::SortPairs( + workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); } } // namespace raft diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index bb2b019ecb..41dc9cab08 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -31,40 +31,43 @@ namespace raft { * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ -DI void sts(float* addr, const float& x) { +DI void sts(float* addr, const float& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x)); } -DI void sts(float* addr, const float (&x)[1]) { +DI void sts(float* addr, const float (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0])); } -DI void sts(float* addr, const float (&x)[2]) { +DI void sts(float* addr, const float (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f32 [%0], {%1, %2};" - : - : "l"(s2), "f"(x[0]), "f"(x[1])); + asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1])); } -DI void sts(float* addr, const float (&x)[4]) { +DI void sts(float* addr, const float (&x)[4]) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};" : : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3])); } -DI void sts(double* addr, const double& x) { +DI void sts(double* addr, const double& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x)); } -DI void sts(double* addr, const double (&x)[1]) { +DI void sts(double* addr, const double (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0])); } -DI void sts(double* addr, const double (&x)[2]) { +DI void sts(double* addr, const double (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f64 [%0], {%1, %2};" - : - : "l"(s2), "d"(x[0]), "d"(x[1])); + asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1])); } /** @} */ @@ -80,39 +83,42 @@ DI void sts(double* addr, const double (&x)[2]) { * @param[in] addr shared memory address from where to load * (should be aligned to vector size) */ -DI void lds(float& x, float* addr) { +DI void lds(float& x, float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); } -DI void lds(float (&x)[1], float* addr) { +DI void lds(float (&x)[1], float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); } -DI void lds(float (&x)[2], float* addr) { +DI void lds(float (&x)[2], float* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); } -DI void lds(float (&x)[4], float* addr) { +DI void lds(float (&x)[4], float* addr) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(s4)); } -DI void lds(double& x, double* addr) { +DI void lds(double& x, double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1)); } -DI void lds(double (&x)[1], double* addr) { +DI void lds(double (&x)[1], double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1)); } -DI void lds(double (&x)[2], double* addr) { +DI void lds(double (&x)[2], double* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2)); } /** @} */ @@ -123,32 +129,35 @@ DI void lds(double (&x)[2], double* addr) { * @param[out] x data to be loaded from global memory * @param[in] addr address in global memory from where to load */ -DI void ldg(float& x, const float* addr) { +DI void ldg(float& x, const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr)); } -DI void ldg(float (&x)[1], const float* addr) { +DI void ldg(float (&x)[1], const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr)); } -DI void ldg(float (&x)[2], const float* addr) { - asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(addr)); +DI void ldg(float (&x)[2], const float* addr) +{ + asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr)); } -DI void ldg(float (&x)[4], const float* addr) { +DI void ldg(float (&x)[4], const float* addr) +{ asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(addr)); } -DI void ldg(double& x, const double* addr) { +DI void ldg(double& x, const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr)); } -DI void ldg(double (&x)[1], const double* addr) { +DI void ldg(double (&x)[1], const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr)); } -DI void ldg(double (&x)[2], const double* addr) { - asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(addr)); +DI void ldg(double (&x)[2], const double* addr) +{ + asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); } /** @} */ diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 785794461e..b228ac5499 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -22,8 +22,8 @@ namespace raft { template -__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, - IdxT len, Lambda op) { +__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +{ typedef TxN_t DataVec; typedef TxN_t IdxVec; IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); @@ -34,61 +34,60 @@ __global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, DataVec dataIn; #pragma unroll for (int i = 0; i < VecLen; ++i) { - auto inPos = idxIn.val.data[i]; + auto inPos = idxIn.val.data[i]; dataIn.val.data[i] = op(in[inPos], tid + i); } dataIn.store(out, tid); } template -void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - Lambda op, cudaStream_t stream) { +void scatterImpl( + DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream) +{ const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); - scatterKernel - <<>>(out, in, idx, len, op); + scatterKernel<<>>(out, in, idx, len, op); CUDA_CHECK(cudaGetLastError()); } /** - * @brief Performs scatter operation based on the input indexing array - * @tparam DataT data type whose array gets scattered - * @tparam IdxT indexing type - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Lambda the device-lambda performing a unary operation on the loaded - * data before it gets scattered - * @param out the output array - * @param in the input array - * @param idx the indexing array - * @param len number of elements in the input array - * @param stream cuda stream where to launch work - * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This - * will be applied to every element before scattering it to the right location. - * The second param in this method will be the destination index. - */ -template , int TPB = 256> -void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - cudaStream_t stream, Lambda op = raft::Nop()) { + * @brief Performs scatter operation based on the input indexing array + * @tparam DataT data type whose array gets scattered + * @tparam IdxT indexing type + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Lambda the device-lambda performing a unary operation on the loaded + * data before it gets scattered + * @param out the output array + * @param in the input array + * @param idx the indexing array + * @param len number of elements in the input array + * @param stream cuda stream where to launch work + * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This + * will be applied to every element before scattering it to the right location. + * The second param in this method will be the destination index. + */ +template , int TPB = 256> +void scatter(DataT* out, + const DataT* in, + const IdxT* idx, + IdxT len, + cudaStream_t stream, + Lambda op = raft::Nop()) +{ if (len <= 0) return; - constexpr size_t DataSize = sizeof(DataT); - constexpr size_t IdxSize = sizeof(IdxT); + constexpr size_t DataSize = sizeof(DataT); + constexpr size_t IdxSize = sizeof(IdxT); constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; - size_t bytes = len * MaxPerElem; + size_t bytes = len * MaxPerElem; if (16 / MaxPerElem && bytes % 16 == 0) { - scatterImpl(out, in, idx, len, - op, stream); + scatterImpl(out, in, idx, len, op, stream); } else if (8 / MaxPerElem && bytes % 8 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (4 / MaxPerElem && bytes % 4 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (2 / MaxPerElem && bytes % 2 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (1 / MaxPerElem) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else { scatterImpl(out, in, idx, len, op, stream); } diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index dc172c9503..72c3b3897e 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,16 +25,7 @@ namespace raft { namespace comms { typedef unsigned int request_t; -enum class datatype_t { - CHAR, - UINT8, - INT32, - UINT32, - INT64, - UINT64, - FLOAT32, - FLOAT64 -}; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; enum class op_t { SUM, PROD, MIN, MAX }; /** @@ -50,42 +41,50 @@ template constexpr datatype_t get_type(); template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::CHAR; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT8; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT64; } @@ -95,72 +94,99 @@ class comms_iface { virtual int get_rank() const = 0; virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual void barrier() const = 0; + virtual void barrier() const = 0; virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; - virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, cudaStream_t stream) const = 0; - virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, void* recvbuf, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, cudaStream_t stream) const = 0; + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; - virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void reducescatter(const void* sendbuff, void* recvbuff, - size_t recvcount, datatype_t datatype, op_t op, + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const = 0; + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, cudaStream_t stream) const = 0; - virtual void device_multicast_sendrecv( - const void* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - void* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const = 0; + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; }; class comms_t { public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) { + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } @@ -187,7 +213,8 @@ class comms_t { * @param color ranks w/ the same color are placed in the same communicator * @param key controls rank assignment */ - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { return impl_->comm_split(color, key); } @@ -204,9 +231,7 @@ class comms_t { * * @param stream the cuda stream to sync collective operations on */ - status_t sync_stream(cudaStream_t stream) const { - return impl_->sync_stream(stream); - } + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } /** * Performs an asynchronous point-to-point send @@ -219,10 +244,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void isend(const value_t* buf, size_t size, int dest, int tag, - request_t* request) const { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, - tag, request); + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); } /** @@ -236,10 +260,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void irecv(value_t* buf, size_t size, int source, int tag, - request_t* request) const { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, - request); + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); } /** @@ -247,7 +270,8 @@ class comms_t { * @param count number of requests to synchronize on * @param array_of_requests an array of request_t objects returned from isend/irecv */ - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { impl_->waitall(count, array_of_requests); } @@ -261,11 +285,15 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, - op_t op, cudaStream_t stream) const { + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), - op, stream); + static_cast(recvbuff), + count, + get_type(), + op, + stream); } /** @@ -277,9 +305,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { - impl_->bcast(static_cast(buff), count, get_type(), root, - stream); + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); } /** @@ -293,11 +321,20 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, - int root, cudaStream_t stream) const { + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), op, - root, stream); + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); } /** @@ -309,11 +346,16 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - cudaStream_t stream) const { + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, - get_type(), stream); + static_cast(recvbuff), + sendcount, + get_type(), + stream); } /** @@ -328,12 +370,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgatherv(const value_t* sendbuf, value_t* recvbuf, - const size_t* recvcounts, const size_t* displs, - cudaStream_t stream) const { + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), recvcounts, displs, - get_type(), stream); + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); } /** @@ -346,11 +394,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - int root, cudaStream_t stream) const { + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, get_type(), - root, stream); + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); } /** @@ -367,12 +422,22 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, int root, - cudaStream_t stream) const { + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), sendcount, recvcounts, displs, - get_type(), root, stream); + static_cast(recvbuf), + sendcount, + recvcounts, + displs, + get_type(), + root, + stream); } /** @@ -384,11 +449,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reducescatter(const value_t* sendbuff, value_t* recvbuff, - size_t recvcount, op_t op, cudaStream_t stream) const { + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), recvcount, - get_type(), op, stream); + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); } /** @@ -403,10 +475,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_send(const value_t* buf, size_t size, int dest, - cudaStream_t stream) const { - impl_->device_send(static_cast(buf), size * sizeof(value_t), - dest, stream); + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); } /** @@ -421,10 +492,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_recv(value_t* buf, size_t size, int source, - cudaStream_t stream) const { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, - stream); + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); } /** @@ -440,12 +510,21 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest, - value_t* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { - impl_->device_sendrecv( - static_cast(sendbuf), sendsize * sizeof(value_t), dest, - static_cast(recvbuf), recvsize * sizeof(value_t), source, stream); + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); } /** @@ -463,28 +542,37 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_multicast_sendrecv( - const value_t* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - value_t* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { - auto sendbytesizes = sendsizes; + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; auto sendbyteoffsets = sendoffsets; for (size_t i = 0; i < sendsizes.size(); ++i) { sendbytesizes[i] *= sizeof(value_t); sendbyteoffsets[i] *= sizeof(value_t); } - auto recvbytesizes = recvsizes; + auto recvbytesizes = recvsizes; auto recvbyteoffsets = recvoffsets; for (size_t i = 0; i < recvsizes.size(); ++i) { recvbytesizes[i] *= sizeof(value_t); recvbyteoffsets[i] *= sizeof(value_t); } impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, sendbyteoffsets, dests, - static_cast(recvbuf), recvbytesizes, - recvbyteoffsets, sources, stream); + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); } private: diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 7b24e31bbe..93e31b4d6a 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -36,9 +36,9 @@ namespace comms { * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, - int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); +void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank) +{ + auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( @@ -61,40 +61,41 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, - void *ucp_worker, void *eps, int num_ranks, - int rank) { - auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); +void build_comms_nccl_ucx( + handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank) +{ + auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); - auto size_t_ep_arr = reinterpret_cast(eps); + auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < num_ranks; i++) { - size_t ptr = size_t_ep_arr[i]; - auto ucp_ep_v = reinterpret_cast(*eps_sp); + size_t ptr = size_t_ep_arr[i]; + auto ucp_ep_v = reinterpret_cast(*eps_sp); if (ptr != 0) { auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); - ucp_ep_v[i] = eps_ptr; + ucp_ep_v[i] = eps_ptr; } else { ucp_ep_v[i] = nullptr; } } - auto d_alloc = handle->get_device_allocator(); + auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, - num_ranks, rank, d_alloc, stream))); + auto communicator = + std::make_shared(std::unique_ptr(new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc, stream))); handle->set_comms(communicator); } -inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, - int size) { +inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size) +{ memcpy(id->internal, uniqueId, size); } -inline void get_unique_id(char *uid, int size) { +inline void get_unique_id(char* uid, int size) +{ ncclUniqueId id; ncclGetUniqueId(&id); diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 8dda74f0a9..65f38b2625 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -32,16 +32,16 @@ #include #include -#define MPI_TRY(call) \ - do { \ - int status = call; \ - if (MPI_SUCCESS != status) { \ - int mpi_error_string_lenght = 0; \ - char mpi_error_string[MPI_MAX_ERROR_STRING]; \ - MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \ - #call, mpi_error_string); \ - } \ +#define MPI_TRY(call) \ + do { \ + int status = call; \ + if (MPI_SUCCESS != status) { \ + int mpi_error_string_lenght = 0; \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ + RAFT_EXPECTS( \ + MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \ + } \ } while (0) #define MPI_TRY_NO_THROW(call) \ @@ -51,48 +51,41 @@ int mpi_error_string_lenght = 0; \ char mpi_error_string[MPI_MAX_ERROR_STRING]; \ MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - printf("MPI call='%s' at file=%s line=%d failed with %s ", #call, \ - __FILE__, __LINE__, mpi_error_string); \ + printf("MPI call='%s' at file=%s line=%d failed with %s ", \ + #call, \ + __FILE__, \ + __LINE__, \ + mpi_error_string); \ } \ } while (0) namespace raft { namespace comms { -constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) { +constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return MPI_CHAR; - case datatype_t::UINT8: - return MPI_UNSIGNED_CHAR; - case datatype_t::INT32: - return MPI_INT; - case datatype_t::UINT32: - return MPI_UNSIGNED; - case datatype_t::INT64: - return MPI_LONG_LONG; - case datatype_t::UINT64: - return MPI_UNSIGNED_LONG_LONG; - case datatype_t::FLOAT32: - return MPI_FLOAT; - case datatype_t::FLOAT64: - return MPI_DOUBLE; + case datatype_t::CHAR: return MPI_CHAR; + case datatype_t::UINT8: return MPI_UNSIGNED_CHAR; + case datatype_t::INT32: return MPI_INT; + case datatype_t::UINT32: return MPI_UNSIGNED; + case datatype_t::INT64: return MPI_LONG_LONG; + case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG; + case datatype_t::FLOAT32: return MPI_FLOAT; + case datatype_t::FLOAT64: return MPI_DOUBLE; default: // Execution should never reach here. This takes care of compiler warning. return MPI_DOUBLE; } } -constexpr MPI_Op get_mpi_op(const op_t op) { +constexpr MPI_Op get_mpi_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return MPI_SUM; - case op_t::PROD: - return MPI_PROD; - case op_t::MIN: - return MPI_MIN; - case op_t::MAX: - return MPI_MAX; + case op_t::SUM: return MPI_SUM; + case op_t::PROD: return MPI_PROD; + case op_t::MIN: return MPI_MIN; + case op_t::MAX: return MPI_MAX; default: // Execution should never reach here. This takes care of compiler warning. return MPI_MAX; @@ -102,38 +95,35 @@ constexpr MPI_Op get_mpi_op(const op_t op) { class mpi_comms : public comms_iface { public: mpi_comms(MPI_Comm comm, const bool owns_mpi_comm) - : owns_mpi_comm_(owns_mpi_comm), - mpi_comm_(comm), - size_(0), - rank_(1), - next_request_id_(0) { + : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0) + { int mpi_is_initialized = 0; MPI_TRY(MPI_Initialized(&mpi_is_initialized)); RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!"); MPI_TRY(MPI_Comm_size(mpi_comm_, &size_)); MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_)); - //get NCCL unique ID at rank 0 and broadcast it to all others + // get NCCL unique ID at rank 0 and broadcast it to all others ncclUniqueId id; if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id)); MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_)); - //initializing NCCL + // initializing NCCL NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_)); } - virtual ~mpi_comms() { - //finalizing NCCL + virtual ~mpi_comms() + { + // finalizing NCCL NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_)); - if (owns_mpi_comm_) { - MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); - } + if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); } } int get_size() const { return size_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { MPI_Comm new_comm; MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm)); return std::unique_ptr(new mpi_comms(new_comm, true)); @@ -141,15 +131,15 @@ class mpi_comms : public comms_iface { void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); } - void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const { + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req)); @@ -157,15 +147,15 @@ class mpi_comms : public comms_iface { *request = req_id; } - void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const { + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } @@ -174,7 +164,8 @@ class mpi_comms : public comms_iface { *request = req_id; } - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { std::vector requests; requests.reserve(count); for (int i = 0; i < count; ++i) { @@ -189,87 +180,138 @@ class mpi_comms : public comms_iface { MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); } - void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts, - const size_t* displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < size_; ++root) { - NCCL_TRY(ncclBroadcast(sendbuf, - static_cast(recvbuf) + - displs[root] * get_datatype_size(datatype), - recvcounts[root], get_nccl_datatype(datatype), - root, nccl_comm_, stream)); + NCCL_TRY( + ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * get_datatype_size(datatype), + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], get_nccl_datatype(datatype), r, - nccl_comm_, stream)); + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -302,45 +344,58 @@ class mpi_comms : public comms_iface { }; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } void device_multicast_sendrecv(const void* sendbuf, std::vector const& sendsizes, std::vector const& sendoffsets, - std::vector const& dests, void* recvbuf, + std::vector const& dests, + void* recvbuf, std::vector const& recvsizes, std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -358,9 +413,10 @@ class mpi_comms : public comms_iface { mutable std::unordered_set free_requests_; }; -inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) { - auto communicator = std::make_shared( - std::unique_ptr(new mpi_comms(comm, true))); +inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) +{ + auto communicator = + std::make_shared(std::unique_ptr(new mpi_comms(comm, true))); handle->set_comms(communicator); }; diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 765e8741bb..5f80328d3f 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -62,10 +62,14 @@ class std_comms : public comms_iface { * @param size size of the cluster * @param rank rank of the current worker */ - std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int num_ranks, int rank, + std_comms(ncclComm_t nccl_comm, + ucp_worker_h ucp_worker, + std::shared_ptr eps, + int num_ranks, + int rank, const std::shared_ptr device_allocator, - cudaStream_t stream, bool subcomms_ucp = true) + cudaStream_t stream, + bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), num_ranks_(num_ranks), @@ -74,7 +78,8 @@ class std_comms : public comms_iface { ucp_worker_(ucp_worker), ucp_eps_(eps), next_request_id_(0), - device_allocator_(device_allocator) { + device_allocator_(device_allocator) + { initialize(); }; @@ -84,7 +89,9 @@ class std_comms : public comms_iface { * @param size size of the cluster * @param rank rank of the current worker */ - std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, + std_comms(const ncclComm_t nccl_comm, + int num_ranks, + int rank, const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), @@ -92,37 +99,37 @@ class std_comms : public comms_iface { num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(false), - device_allocator_(device_allocator) { + device_allocator_(device_allocator) + { initialize(); }; - virtual ~std_comms() { + virtual ~std_comms() + { device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); } - void initialize() { - sendbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); + void initialize() + { + sendbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); + recvbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); } int get_size() const { return num_ranks_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { mr::device::buffer d_colors(device_allocator_, stream_, get_size()); mr::device::buffer d_keys(device_allocator_, stream_, get_size()); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); - allgather(d_colors.data() + get_rank(), d_colors.data(), 1, - datatype_t::INT32, stream_); - allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, - stream_); + allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_); + allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_); this->sync_stream(stream_); std::vector h_colors(get_size()); @@ -139,9 +146,7 @@ class std_comms : public comms_iface { for (int i = 0; i < get_size(); ++i) { if (h_colors[i] == color) { subcomm_ranks.push_back(i); - if (ucp_worker_ != nullptr && subcomms_ucp_) { - new_ucx_ptrs.push_back((*ucp_eps_)[i]); - } + if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); } } } @@ -150,8 +155,7 @@ class std_comms : public comms_iface { NCCL_TRY(ncclGetUniqueId(&id)); std::vector requests(subcomm_ranks.size() - 1); for (size_t i = 1; i < subcomm_ranks.size(); ++i) { - isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, - requests.data() + (i - 1)); + isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1)); } waitall(requests.size(), requests.data()); } else { @@ -166,17 +170,23 @@ class std_comms : public comms_iface { NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); if (ucp_worker_ != nullptr && subcomms_ucp_) { - auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms( - nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key, - device_allocator_, stream_, subcomms_ucp_)); + auto eps_sp = std::make_shared(new_ucx_ptrs.data()); + return std::unique_ptr(new std_comms(nccl_comm, + (ucp_worker_h)ucp_worker_, + eps_sp, + subcomm_ranks.size(), + key, + device_allocator_, + stream_, + subcomms_ucp_)); } else { - return std::unique_ptr(new std_comms( - nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); + return std::unique_ptr( + new std_comms(nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); } } - void barrier() const { + void barrier() const + { CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); @@ -186,39 +196,37 @@ class std_comms : public comms_iface { "ERROR: syncStream failed. This can be caused by a failed rank_."); } - void get_request_id(request_t *req) const { + void get_request_id(request_t* req) const + { request_t req_id; if (this->free_requests_.empty()) req_id = this->next_request_id_++; else { auto it = this->free_requests_.begin(); - req_id = *it; + req_id = *it; this->free_requests_.erase(it); } *req = req_id; } - void isend(const void *buf, size_t size, int dest, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); ucp_ep_h ep_ptr = (*ucp_eps_)[dest]; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); - this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, - default_tag_mask, get_rank()); + this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank()); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void irecv(void *buf, size_t size, int source, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); @@ -226,18 +234,17 @@ class std_comms : public comms_iface { ucp_tag_t tag_mask = default_tag_mask; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, - tag_mask, source); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); + ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void waitall(int count, request_t array_of_requests[]) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void waitall(int count, request_t array_of_requests[]) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); - std::vector requests; + std::vector requests; requests.reserve(count); time_t start = time(NULL); @@ -245,7 +252,8 @@ class std_comms : public comms_iface { for (int i = 0; i < count; ++i) { auto req_it = requests_in_flight_.find(array_of_requests[i]); ASSERT(requests_in_flight_.end() != req_it, - "ERROR: waitall on invalid request: %d", array_of_requests[i]); + "ERROR: waitall on invalid request: %d", + array_of_requests[i]); requests.push_back(req_it->second); free_requests_.insert(req_it->first); requests_in_flight_.erase(req_it); @@ -258,8 +266,7 @@ class std_comms : public comms_iface { // in 10 or more seconds. ASSERT(now - start < 10, "Timed out waiting for requests."); - for (std::vector::iterator it = requests.begin(); - it != requests.end();) { + for (std::vector::iterator it = requests.begin(); it != requests.end();) { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue @@ -272,10 +279,8 @@ class std_comms : public comms_iface { // If the message needs release, we know it will be sent/received // asynchronously, so we will need to track and verify its state if (req->needs_release) { - ASSERT(UCS_PTR_IS_PTR(req->req), - "UCX Request Error. Request is not valid UCX pointer"); - ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", - UCS_PTR_STATUS(req->req)); + ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer"); + ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req)); ASSERT(req->req->completed == 1 || req->req->completed == 0, "request->completed not a valid value: %d\n", req->req->completed); @@ -296,94 +301,143 @@ class std_comms : public comms_iface { ++it; } // if any progress was made, reset the timeout start time - if (restart) { - start = time(NULL); - } + if (restart) { start = time(NULL); } } } } - void allreduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void *buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts, - const size_t *displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclBroadcast( - sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + NCCL_TRY(ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * dtype_size, + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount, - const size_t *recvcounts, const size_t *displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -416,45 +470,58 @@ class std_comms : public comms_iface { } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void *buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void *buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void *sendbuf, size_t sendsize, int dest, - void *recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void device_multicast_sendrecv(const void *sendbuf, - std::vector const &sendsizes, - std::vector const &sendoffsets, - std::vector const &dests, void *recvbuf, - std::vector const &recvsizes, - std::vector const &recvoffsets, - std::vector const &sources, - cudaStream_t stream) const { + void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { - NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { - NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -473,10 +540,9 @@ class std_comms : public comms_iface { comms_ucp_handler ucp_handler_; ucp_worker_h ucp_worker_; - std::shared_ptr ucp_eps_; + std::shared_ptr ucp_eps_; mutable request_t next_request_id_; - mutable std::unordered_map - requests_in_flight_; + mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; std::shared_ptr device_allocator_; diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 4e95c4eef0..86827a294e 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -37,8 +37,9 @@ namespace comms { * @param the raft handle to use. This is expected to already have an * initialized comms instance. */ -bool test_collective_allreduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allreduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = 1; @@ -46,14 +47,12 @@ bool test_collective_allreduce(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK( - cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK( - cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -69,8 +68,9 @@ bool test_collective_allreduce(const handle_t &handle, int root) { * @param the raft handle to use. This is expected to already have an * initialized comms instance. */ -bool test_collective_broadcast(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_broadcast(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -80,14 +80,12 @@ bool test_collective_broadcast(const handle_t &handle, int root) { temp_d.resize(1, stream); if (communicator.get_rank() == root) - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -97,8 +95,9 @@ bool test_collective_broadcast(const handle_t &handle, int root) { return temp_h == root; } -bool test_collective_reduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -107,14 +106,12 @@ bool test_collective_reduce(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -127,8 +124,9 @@ bool test_collective_reduce(const handle_t &handle, int root) { return true; } -bool test_collective_allgather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allgather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -137,19 +135,16 @@ bool test_collective_allgather(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - communicator.get_size()); + raft::mr::device::buffer recv_d( + handle.get_device_allocator(), stream, communicator.get_size()); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); communicator.sync_stream(stream); - int - temp_h[communicator.get_size()]; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), - sizeof(int) * communicator.get_size(), - cudaMemcpyDeviceToHost, stream)); + int temp_h[communicator.get_size()]; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync( + &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -162,8 +157,9 @@ bool test_collective_allgather(const handle_t &handle, int root) { return true; } -bool test_collective_gather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -173,20 +169,19 @@ bool test_collective_gather(const handle_t &handle, int root) { temp_d.resize(1, stream); raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, + handle.get_device_allocator(), + stream, communicator.get_rank() == root ? communicator.get_size() : 0); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(communicator.get_size(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), - sizeof(int) * temp_h.size(), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -196,46 +191,48 @@ bool test_collective_gather(const handle_t &handle, int root) { return true; } -bool test_collective_gatherv(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gatherv(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sendcounts(communicator.get_size()); std::iota(sendcounts.begin(), sendcounts.end(), size_t{1}); std::vector displacements(communicator.get_size() + 1, 0); - std::partial_sum(sendcounts.begin(), sendcounts.end(), - displacements.begin() + 1); + std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1); - std::vector sends(displacements[communicator.get_rank() + 1] - - displacements[communicator.get_rank()], - communicator.get_rank()); + std::vector sends( + displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()], + communicator.get_rank()); cudaStream_t stream = handle.get_stream(); raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(sends.size(), stream); - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, - communicator.get_rank() == root ? displacements.back() : 0); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), + stream, + communicator.get_rank() == root ? displacements.back() : 0); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gatherv( - temp_d.data(), recv_d.data(), temp_d.size(), - communicator.get_rank() == root ? sendcounts.data() - : static_cast(nullptr), - communicator.get_rank() == root ? displacements.data() - : static_cast(nullptr), - root, stream); + temp_d.data(), + recv_d.data(), + temp_d.size(), + communicator.get_rank() == root ? sendcounts.data() : static_cast(nullptr), + communicator.get_rank() == root ? displacements.data() : static_cast(nullptr), + root, + stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(displacements.back(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), + recv_d.data(), sizeof(int) * displacements.back(), - cudaMemcpyDeviceToHost, stream)); + cudaMemcpyDeviceToHost, + stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -249,28 +246,24 @@ bool test_collective_gatherv(const handle_t &handle, int root) { return true; } -bool test_collective_reducescatter(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reducescatter(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sends(communicator.get_size(), 1); cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, - sends.size()); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - 1); + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, sends.size()); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, - stream); + communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -287,9 +280,10 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { * initialized comms instance. * @param number of iterations of all-to-all messaging to perform */ -bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); +bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -298,11 +292,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { std::vector requests; requests.resize(2 * (communicator.get_size() - 1)); int request_idx = 0; - //post receives + // post receives for (int r = 0; r < communicator.get_size(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, r, 0, - requests.data() + request_idx); + communicator.irecv( + received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx); ++request_idx; } } @@ -338,8 +332,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { communicator.barrier(); } - if (communicator.get_rank() == 0) - std::cout << "=========================" << std::endl; + if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl; } return ret; @@ -352,10 +345,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -378,13 +372,9 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { communicator.sync_stream(stream); - if (!sender && received_data.value(stream) != rank - 1) { - ret = false; - } + if (!sender && received_data.value(stream) != rank - 1) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -397,10 +387,11 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -414,12 +405,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { if (rank % 2 == 0) { if (rank + 1 < communicator.get_size()) { - communicator.device_sendrecv(sent_data.data(), 1, rank + 1, - received_data.data(), 1, rank + 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream); } } else { - communicator.device_sendrecv(sent_data.data(), 1, rank - 1, - received_data.data(), 1, rank - 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream); } communicator.sync_stream(stream); @@ -429,9 +420,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -444,11 +433,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, - int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -471,25 +460,26 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, std::vector srcs(communicator.get_size()); std::iota(srcs.begin(), srcs.end(), int{0}); - communicator.device_multicast_sendrecv( - sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(), - recvsizes, recvoffsets, srcs, stream); + communicator.device_multicast_sendrecv(sent_data.data(), + sendsizes, + sendoffsets, + dests, + received_data.data(), + recvsizes, + recvoffsets, + srcs, + stream); communicator.sync_stream(stream); std::vector h_received_data(communicator.get_size()); - raft::update_host(h_received_data.data(), received_data.data(), - received_data.size(), stream); + raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream); CUDA_TRY(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); ++i) { - if (h_received_data[i] != i) { - ret = false; - } + if (h_received_data[i] != i) { ret = false; } } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -502,20 +492,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, * initialized comms instance. * @param n_colors number of different colors to test */ -bool test_commsplit(const handle_t &h, int n_colors) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - int const size = communicator.get_size(); +bool test_commsplit(const handle_t& h, int n_colors) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + int const size = communicator.get_size(); if (n_colors > size) n_colors = size; // first we need to assign to a color, then assign the rank within the color int color = rank % n_colors; - int key = rank / n_colors; + int key = rank / n_colors; handle_t new_handle(1); - auto shared_comm = - std::make_shared(communicator.comm_split(color, key)); + auto shared_comm = std::make_shared(communicator.comm_split(color, key)); new_handle.set_comms(shared_comm); return test_collective_allreduce(new_handle, 0); diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 226b6f0527..89c7b25630 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -25,16 +25,19 @@ namespace raft { namespace comms { -typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); -typedef void (*dlsym_rec_free)(void *); +typedef void (*dlsym_print_info)(ucp_ep_h, FILE*); +typedef void (*dlsym_rec_free)(void*); typedef int (*dlsym_worker_progress)(ucp_worker_h); -typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t, - ucp_datatype_t, ucp_tag_t, - ucp_send_callback_t); -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count, - ucp_datatype_t datatype, ucp_tag_t, - ucp_tag_t, ucp_tag_recv_callback_t); +typedef ucs_status_ptr_t (*dlsym_send)( + ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t); +typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, + void*, + size_t count, + ucp_datatype_t datatype, + ucp_tag_t, + ucp_tag_t, + ucp_tag_recv_callback_t); /** * Standard UCX request object that will be passed @@ -55,9 +58,9 @@ struct ucx_context { */ class ucp_request { public: - struct ucx_context *req; - bool needs_release = true; - int other_rank = -1; + struct ucx_context* req; + bool needs_release = true; + int other_rank = -1; bool is_send_request = false; }; @@ -67,18 +70,19 @@ static const ucp_tag_t default_tag_mask = -1; /** * @brief Asynchronous send callback sets request to completed */ -static void send_callback(void *request, ucs_status_t status) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void send_callback(void* request, ucs_status_t status) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** * @brief Asynchronous recv callback sets request to completed */ -static void recv_callback(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** @@ -87,7 +91,8 @@ static void recv_callback(void *request, ucs_status_t status, */ class comms_ucp_handler { public: - comms_ucp_handler() { + comms_ucp_handler() + { load_ucp_handle(); load_send_func(); load_recv_func(); @@ -99,7 +104,7 @@ class comms_ucp_handler { ~comms_ucp_handler() { dlclose(ucp_handle); } private: - void *ucp_handle; + void* ucp_handle; dlsym_print_info print_info_func; dlsym_rec_free req_free_func; @@ -107,7 +112,8 @@ class comms_ucp_handler { dlsym_send send_func; dlsym_recv recv_func; - void load_ucp_handle() { + void load_ucp_handle() + { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); if (!ucp_handle) { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); @@ -117,51 +123,56 @@ class comms_ucp_handler { dlerror(); } - void assert_dlerror() { - char *error = dlerror(); + void assert_dlerror() + { + char* error = dlerror(); ASSERT(error == NULL, "Error loading function symbol: %s\n", error); } - void load_send_func() { + void load_send_func() + { send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); assert_dlerror(); } - void load_free_req_func() { + void load_free_req_func() + { req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); assert_dlerror(); } - void load_print_info_func() { + void load_print_info_func() + { print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); assert_dlerror(); } - void load_worker_progress_func() { - worker_progress_func = - (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); + void load_worker_progress_func() + { + worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); assert_dlerror(); } - void load_recv_func() { + void load_recv_func() + { recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); assert_dlerror(); } - ucp_tag_t build_message_tag(int rank, int tag) const { + ucp_tag_t build_message_tag(int rank, int tag) const + { // keeping the rank in the lower bits enables debugging. return ((uint32_t)tag << 31) | (uint32_t)rank; } public: - int ucp_progress(ucp_worker_h worker) const { - return (*(worker_progress_func))(worker); - } + int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); } /** * @brief Frees any memory underlying the given ucp request object */ - void free_ucp_request(ucp_request *request) const { + void free_ucp_request(ucp_request* request) const + { if (request->needs_release) { request->req->completed = 0; (*(req_free_func))(request->req); @@ -172,56 +183,67 @@ class comms_ucp_handler { /** * @brief Asynchronously send data to the given endpoint using the given tag */ - void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, - size_t size, int tag, ucp_tag_t tag_mask, int rank) const { + void ucp_isend(ucp_request* req, + ucp_ep_h ep_ptr, + const void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int rank) const + { ucp_tag_t ucp_tag = build_message_tag(rank, tag); - ucs_status_ptr_t send_result = (*(send_func))( - ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); - struct ucx_context *ucp_req = (struct ucx_context *)send_result; + ucs_status_ptr_t send_result = + (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + struct ucx_context* ucp_req = (struct ucx_context*)send_result; if (UCS_PTR_IS_ERR(send_result)) { ASSERT(!UCS_PTR_IS_ERR(send_result), "unable to send UCX data message (%d)\n", UCS_PTR_STATUS(send_result)); /** - * If the request didn't fail, but it's not OK, it is in flight. - * Expect the handler to be invoked - */ + * If the request didn't fail, but it's not OK, it is in flight. + * Expect the handler to be invoked + */ } else if (UCS_PTR_STATUS(send_result) != UCS_OK) { /** - * If the request is OK, it's already been completed and we don't need to wait on it. - * The request will be a nullptr, however, so we need to create a new request - * and set it to completed to make the "waitall()" function work properly. - */ + * If the request is OK, it's already been completed and we don't need to wait on it. + * The request will be a nullptr, however, so we need to create a new request + * and set it to completed to make the "waitall()" function work properly. + */ req->needs_release = true; } else { req->needs_release = false; } - req->other_rank = rank; + req->other_rank = rank; req->is_send_request = true; - req->req = ucp_req; + req->req = ucp_req; } /** * @brief Asynchronously receive data from given endpoint with the given tag. */ - void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, - void *buf, size_t size, int tag, ucp_tag_t tag_mask, - int sender_rank) const { + void ucp_irecv(ucp_request* req, + ucp_worker_h worker, + ucp_ep_h ep_ptr, + void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int sender_rank) const + { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, - tag_mask, recv_callback); + (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); - struct ucx_context *ucp_req = (struct ucx_context *)recv_result; + struct ucx_context* ucp_req = (struct ucx_context*)recv_result; - req->req = ucp_req; - req->needs_release = true; + req->req = ucp_req; + req->needs_release = true; req->is_send_request = false; - req->other_rank = sender_rank; + req->other_rank = sender_rank; ASSERT(!UCS_PTR_IS_ERR(recv_result), "unable to receive UCX data message (%d)\n", diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp index f3216abc37..1b0548fc00 100644 --- a/cpp/include/raft/comms/util.hpp +++ b/cpp/include/raft/comms/util.hpp @@ -26,88 +26,70 @@ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, ncclGetErrorString(status)); \ - throw raft::logic_error(msg); \ - } \ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + ncclGetErrorString(status)); \ + throw raft::logic_error(msg); \ + } \ } while (0); -#define NCCL_TRY_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (ncclSuccess != status) { \ - printf("NCCL call='%s' failed. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } \ +#define NCCL_TRY_NO_THROW(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \ + } \ } while (0) namespace raft { namespace comms { -constexpr size_t get_datatype_size(const datatype_t datatype) { +constexpr size_t get_datatype_size(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return sizeof(char); - case datatype_t::UINT8: - return sizeof(uint8_t); - case datatype_t::INT32: - return sizeof(int); - case datatype_t::UINT32: - return sizeof(unsigned int); - case datatype_t::INT64: - return sizeof(int64_t); - case datatype_t::UINT64: - return sizeof(uint64_t); - case datatype_t::FLOAT32: - return sizeof(float); - case datatype_t::FLOAT64: - return sizeof(double); - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return sizeof(char); + case datatype_t::UINT8: return sizeof(uint8_t); + case datatype_t::INT32: return sizeof(int); + case datatype_t::UINT32: return sizeof(unsigned int); + case datatype_t::INT64: return sizeof(int64_t); + case datatype_t::UINT64: return sizeof(uint64_t); + case datatype_t::FLOAT32: return sizeof(float); + case datatype_t::FLOAT64: return sizeof(double); + default: throw "Unsupported datatype"; } } -constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { +constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return ncclChar; - case datatype_t::UINT8: - return ncclUint8; - case datatype_t::INT32: - return ncclInt; - case datatype_t::UINT32: - return ncclUint32; - case datatype_t::INT64: - return ncclInt64; - case datatype_t::UINT64: - return ncclUint64; - case datatype_t::FLOAT32: - return ncclFloat; - case datatype_t::FLOAT64: - return ncclDouble; - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return ncclChar; + case datatype_t::UINT8: return ncclUint8; + case datatype_t::INT32: return ncclInt; + case datatype_t::UINT32: return ncclUint32; + case datatype_t::INT64: return ncclInt64; + case datatype_t::UINT64: return ncclUint64; + case datatype_t::FLOAT32: return ncclFloat; + case datatype_t::FLOAT64: return ncclDouble; + default: throw "Unsupported datatype"; } } -constexpr ncclRedOp_t get_nccl_op(const op_t op) { +constexpr ncclRedOp_t get_nccl_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return ncclSum; - case op_t::PROD: - return ncclProd; - case op_t::MIN: - return ncclMin; - case op_t::MAX: - return ncclMax; - default: - throw "Unsupported datatype"; + case op_t::SUM: return ncclSum; + case op_t::PROD: return ncclProd; + case op_t::MIN: return ncclMin; + case op_t::MAX: return ncclMax; + default: throw "Unsupported datatype"; } } }; // namespace comms diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index 14274043f5..8a66eff242 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -36,16 +36,17 @@ namespace raft { /** helper macro for device inlined functions */ -#define DI inline __device__ +#define DI inline __device__ #define HDI inline __host__ __device__ -#define HD __host__ __device__ +#define HD __host__ __device__ /** * @brief Provide a ceiling division operation ie. ceil(a / b) * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType ceildiv(IntType a, IntType b) { +constexpr HDI IntType ceildiv(IntType a, IntType b) +{ return (a + b - 1) / b; } @@ -54,7 +55,8 @@ constexpr HDI IntType ceildiv(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignTo(IntType a, IntType b) { +constexpr HDI IntType alignTo(IntType a, IntType b) +{ return ceildiv(a, b) * b; } @@ -63,7 +65,8 @@ constexpr HDI IntType alignTo(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignDown(IntType a, IntType b) { +constexpr HDI IntType alignDown(IntType a, IntType b) +{ return (a / b) * b; } @@ -72,7 +75,8 @@ constexpr HDI IntType alignDown(IntType a, IntType b) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI bool isPo2(IntType num) { +constexpr HDI bool isPo2(IntType num) +{ return (num && !(num & (num - 1))); } @@ -81,14 +85,16 @@ constexpr HDI bool isPo2(IntType num) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) { +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) +{ return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret); } /** Device function to apply the input lambda across threads in the grid */ template -DI void forEach(int num, L lambda) { - int idx = (blockDim.x * blockIdx.x) + threadIdx.x; +DI void forEach(int num, L lambda) +{ + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; const int numThreads = blockDim.x * gridDim.x; #pragma unroll for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { @@ -100,7 +106,8 @@ DI void forEach(int num, L lambda) { static const int WarpSize = 32; /** get the laneId of the current thread */ -DI int laneId() { +DI int laneId() +{ int id; asm("mov.s32 %0, %laneid;" : "=r"(id)); return id; @@ -113,15 +120,17 @@ DI int laneId() { * @param b second input */ template -HDI void swapVals(T &a, T &b) { +HDI void swapVals(T& a, T& b) +{ T tmp = a; - a = b; - b = tmp; + a = b; + b = tmp; } /** Device function to have atomic add support for older archs */ template -DI void myAtomicAdd(Type *address, Type val) { +DI void myAtomicAdd(Type* address, Type val) +{ atomicAdd(address, val); } @@ -129,105 +138,114 @@ DI void myAtomicAdd(Type *address, Type val) { // Ref: // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf template <> -DI void myAtomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); + old = + atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); } #endif template -DI void myAtomicReduce(T *address, T val, ReduceLambda op); +DI void myAtomicReduce(T* address, T val, ReduceLambda op); template -DI void myAtomicReduce(double *address, double val, ReduceLambda op) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicReduce(double* address, double val, ReduceLambda op) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = - atomicCAS(address_as_ull, assumed, - __double_as_longlong(op(val, __longlong_as_double(assumed)))); + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(float *address, float val, ReduceLambda op) { - unsigned int *address_as_uint = (unsigned int *)address; - unsigned int old = *address_as_uint, assumed; +DI void myAtomicReduce(float* address, float val, ReduceLambda op) +{ + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; do { assumed = old; - old = atomicCAS(address_as_uint, assumed, - __float_as_uint(op(val, __uint_as_float(assumed)))); + old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(int *address, int val, ReduceLambda op) { +DI void myAtomicReduce(int* address, int val, ReduceLambda op) +{ int old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) { +DI void myAtomicReduce(long long* address, long long val, ReduceLambda op) +{ long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(unsigned long long *address, unsigned long long val, - ReduceLambda op) { +DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op) +{ unsigned long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val); +DI T myAtomicMin(T* address, T val); /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val); +DI T myAtomicMax(T* address, T val); -DI float myAtomicMin(float *address, float val) { +DI float myAtomicMin(float* address, float val) +{ myAtomicReduce(address, val, fminf); return *address; } -DI float myAtomicMax(float *address, float val) { +DI float myAtomicMax(float* address, float val) +{ myAtomicReduce(address, val, fmaxf); return *address; } -DI double myAtomicMin(double *address, double val) { +DI double myAtomicMin(double* address, double val) +{ myAtomicReduce(address, val, fmin); return *address; } -DI double myAtomicMax(double *address, double val) { +DI double myAtomicMax(double* address, double val) +{ myAtomicReduce(address, val, fmax); return *address; } @@ -239,11 +257,13 @@ DI double myAtomicMax(double *address, double val) { template HDI T myMax(T x, T y); template <> -HDI float myMax(float x, float y) { +HDI float myMax(float x, float y) +{ return fmaxf(x, y); } template <> -HDI double myMax(double x, double y) { +HDI double myMax(double x, double y) +{ return fmax(x, y); } /** @} */ @@ -255,11 +275,13 @@ HDI double myMax(double x, double y) { template HDI T myMin(T x, T y); template <> -HDI float myMin(float x, float y) { +HDI float myMin(float x, float y) +{ return fminf(x, y); } template <> -HDI double myMin(double x, double y) { +HDI double myMin(double x, double y) +{ return fmin(x, y); } /** @} */ @@ -267,11 +289,13 @@ HDI double myMin(double x, double y) { /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val) { +DI T myAtomicMin(T* address, T val) +{ myAtomicReduce(address, val, myMin); return *address; } @@ -279,11 +303,13 @@ DI T myAtomicMin(T *address, T val) { /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val) { +DI T myAtomicMax(T* address, T val) +{ myAtomicReduce(address, val, myMax); return *address; } @@ -292,7 +318,8 @@ DI T myAtomicMax(T *address, T val) { * Sign function */ template -HDI int sgn(const T val) { +HDI int sgn(const T val) +{ return (T(0) < val) - (val < T(0)); } @@ -303,11 +330,13 @@ HDI int sgn(const T val) { template HDI T myExp(T x); template <> -HDI float myExp(float x) { +HDI float myExp(float x) +{ return expf(x); } template <> -HDI double myExp(double x) { +HDI double myExp(double x) +{ return exp(x); } /** @} */ @@ -319,11 +348,13 @@ HDI double myExp(double x) { template inline __device__ T myInf(); template <> -inline __device__ float myInf() { +inline __device__ float myInf() +{ return CUDART_INF_F; } template <> -inline __device__ double myInf() { +inline __device__ double myInf() +{ return CUDART_INF; } /** @} */ @@ -335,11 +366,13 @@ inline __device__ double myInf() { template HDI T myLog(T x); template <> -HDI float myLog(float x) { +HDI float myLog(float x) +{ return logf(x); } template <> -HDI double myLog(double x) { +HDI double myLog(double x) +{ return log(x); } /** @} */ @@ -351,11 +384,13 @@ HDI double myLog(double x) { template HDI T mySqrt(T x); template <> -HDI float mySqrt(float x) { +HDI float mySqrt(float x) +{ return sqrtf(x); } template <> -HDI double mySqrt(double x) { +HDI double mySqrt(double x) +{ return sqrt(x); } /** @} */ @@ -365,13 +400,15 @@ HDI double mySqrt(double x) { * @{ */ template -DI void mySinCos(T x, T &s, T &c); +DI void mySinCos(T x, T& s, T& c); template <> -DI void mySinCos(float x, float &s, float &c) { +DI void mySinCos(float x, float& s, float& c) +{ sincosf(x, &s, &c); } template <> -DI void mySinCos(double x, double &s, double &c) { +DI void mySinCos(double x, double& s, double& c) +{ sincos(x, &s, &c); } /** @} */ @@ -383,11 +420,13 @@ DI void mySinCos(double x, double &s, double &c) { template DI T mySin(T x); template <> -DI float mySin(float x) { +DI float mySin(float x) +{ return sinf(x); } template <> -DI double mySin(double x) { +DI double mySin(double x) +{ return sin(x); } /** @} */ @@ -397,15 +436,18 @@ DI double mySin(double x) { * @{ */ template -DI T myAbs(T x) { +DI T myAbs(T x) +{ return x < 0 ? -x : x; } template <> -DI float myAbs(float x) { +DI float myAbs(float x) +{ return fabsf(x); } template <> -DI double myAbs(double x) { +DI double myAbs(double x) +{ return fabs(x); } /** @} */ @@ -417,11 +459,13 @@ DI double myAbs(double x) { template HDI T myPow(T x, T power); template <> -HDI float myPow(float x, float power) { +HDI float myPow(float x, float power) +{ return powf(x, power); } template <> -HDI double myPow(double x, double power) { +HDI double myPow(double x, double power) +{ return pow(x, power); } /** @} */ @@ -433,11 +477,13 @@ HDI double myPow(double x, double power) { template HDI T myTanh(T x); template <> -HDI float myTanh(float x) { +HDI float myTanh(float x) +{ return tanhf(x); } template <> -HDI double myTanh(double x) { +HDI double myTanh(double x) +{ return tanh(x); } /** @} */ @@ -449,11 +495,13 @@ HDI double myTanh(double x) { template HDI T myATanh(T x); template <> -HDI float myATanh(float x) { +HDI float myATanh(float x) +{ return atanhf(x); } template <> -HDI double myATanh(double x) { +HDI double myATanh(double x) +{ return atanh(x); } /** @} */ @@ -492,15 +540,18 @@ struct Sum { * @{ */ template -DI T signPrim(T x) { +DI T signPrim(T x) +{ return x < 0 ? -1 : +1; } template <> -DI float signPrim(float x) { +DI float signPrim(float x) +{ return signbit(x) == true ? -1.0f : +1.0f; } template <> -DI double signPrim(double x) { +DI double signPrim(double x) +{ return signbit(x) == true ? -1.0 : +1.0; } /** @} */ @@ -514,28 +565,33 @@ DI double signPrim(double x) { * @{ */ template -DI T maxPrim(T x, T y) { +DI T maxPrim(T x, T y) +{ return x > y ? x : y; } template <> -DI float maxPrim(float x, float y) { +DI float maxPrim(float x, float y) +{ return fmaxf(x, y); } template <> -DI double maxPrim(double x, double y) { +DI double maxPrim(double x, double y) +{ return fmax(x, y); } /** @} */ /** apply a warp-wide fence (useful from Volta+ archs) */ -DI void warpFence() { +DI void warpFence() +{ #if __CUDA_ARCH__ >= 700 __syncwarp(); #endif } /** warp-wide any boolean aggregator */ -DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __any_sync(mask, inFlag); #else @@ -545,7 +601,8 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { } /** warp-wide all boolean aggregator */ -DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __all_sync(mask, inFlag); #else @@ -564,8 +621,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { * @return the shuffled data */ template -DI T shfl(T val, int srcLane, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_sync(mask, val, srcLane, width); #else @@ -583,8 +640,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize, * @return the shuffled data */ template -DI T shfl_xor(T val, int laneMask, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_xor_sync(mask, val, laneMask, width); #else @@ -602,7 +659,8 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, * @todo Expand this to support arbitrary reduction ops */ template -DI T warpReduce(T val) { +DI T warpReduce(T val) +{ #pragma unroll for (int i = WarpSize / 2; i > 0; i >>= 1) { T tmp = shfl(val, laneId() + i); @@ -623,12 +681,13 @@ DI T warpReduce(T val) { * @todo Expand this to support arbitrary reduction ops */ template -DI T blockReduce(T val, char *smem) { - auto *sTemp = reinterpret_cast(smem); - int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; - int lid = laneId(); - int wid = threadIdx.x / WarpSize; - val = warpReduce(val); +DI T blockReduce(T val, char* smem) +{ + auto* sTemp = reinterpret_cast(smem); + int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; + int lid = laneId(); + int wid = threadIdx.x / WarpSize; + val = warpReduce(val); if (lid == 0) sTemp[wid] = val; __syncthreads(); val = lid < nWarps ? sTemp[lid] : T(0); @@ -644,8 +703,10 @@ DI T blockReduce(T val, char *smem) { * @param idx the index for which to query the stream */ inline cudaStream_t select_stream(cudaStream_t user_stream, - cudaStream_t *int_streams, int n_int_streams, - int idx) { + cudaStream_t* int_streams, + int n_int_streams, + int idx) +{ return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; } diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 86c60addf2..872dab7d82 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -49,17 +49,20 @@ struct cuda_error : public raft::exception { * exception detailing the CUDA error that occurred * */ -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \ - cudaGetErrorName(status), cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ +#define CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ } while (0) /** @@ -89,13 +92,16 @@ struct cuda_error : public raft::exception { // * @brief check for cuda runtime API errors but log error instead of raising // * exception. // */ -#define CUDA_CHECK_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \ - __FILE__, __LINE__, cudaGetErrorString(status)); \ - } \ +#define CUDA_CHECK_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ } while (0) namespace raft { @@ -103,9 +109,7 @@ namespace raft { /** Helper method to get to know warp size in device code */ __host__ __device__ constexpr inline int warp_size() { return 32; } -__host__ __device__ constexpr inline unsigned int warp_full_mask() { - return 0xffffffff; -} +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping @@ -124,13 +128,16 @@ class grid_1d_thread_t { * @param elements_per_thread Typically, a single kernel thread processes more than a single * element; this affects the number of threads the grid must contain */ - grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, - size_t max_num_blocks_1d, size_t elements_per_thread = 1) + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + - (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) { + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -153,13 +160,14 @@ class grid_1d_warp_t { * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ - grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min( - (overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) { + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -181,10 +189,12 @@ class grid_1d_block_t { * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ - grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block, + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) { + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -200,9 +210,9 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { - CUDA_CHECK( - cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -213,23 +223,22 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { */ /** performs a host to device copy */ template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - cudaStream_t stream) { +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, cudaStream_t stream) +{ copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - cudaStream_t stream) { +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, cudaStream_t stream) +{ copy(h_ptr, d_ptr, len, stream); } template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - cudaStream_t stream) { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream)); +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, cudaStream_t stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -238,8 +247,11 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, * @{ */ template -void print_host_vector(const char* variable_name, const T* host_mem, - size_t componentsCount, OutStream& out) { +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ out << variable_name << "=["; for (size_t i = 0; i < componentsCount; ++i) { if (i != 0) out << ","; @@ -249,11 +261,13 @@ void print_host_vector(const char* variable_name, const T* host_mem, } template -void print_device_vector(const char* variable_name, const T* devMem, - size_t componentsCount, OutStream& out) { +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem, componentsCount, out); delete[] host_mem; } @@ -261,35 +275,36 @@ void print_device_vector(const char* variable_name, const T* devMem, /** cuda malloc */ template -void allocate(Type*& ptr, size_t len, bool setZero = false) { +void allocate(Type*& ptr, size_t len, bool setZero = false) +{ CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); } /** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() { +inline int getSharedMemPerBlock() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int smemPerBlk; - CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, - cudaDevAttrMaxSharedMemoryPerBlock, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); return smemPerBlk; } /** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() { +inline int getMultiProcessorCount() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int mpCount; - CUDA_CHECK( - cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); return mpCount; } /** helper method to convert an array on device to a string on host */ template -std::string arr2Str(const T* arr, int size, std::string name, - cudaStream_t stream, int width = 4) { +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ std::stringstream ss; T* arr_h = (T*)malloc(size * sizeof(T)); @@ -311,53 +326,54 @@ std::string arr2Str(const T* arr, int size, std::string name, /** this seems to be unused, but may be useful in the future */ template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) { +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ cudaPointerAttributes s_att; cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name - << ". device=" << s_att.device << ", err=" << s_err << std::endl; + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; } -inline uint32_t curTimeMillis() { - auto now = std::chrono::high_resolution_clock::now(); +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration) - .count(); + return std::chrono::duration_cast(duration).count(); } /** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { - return rows * columns; -} + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ + * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ template -bool is_aligned(Type* ptr, size_t alignment) { +bool is_aligned(Type* ptr, size_t alignment) +{ return reinterpret_cast(ptr) % alignment == 0; } /** calculate greatest common divisor of two numbers -* @a integer -* @b integer -* @ return gcd of a and b -*/ + * @a integer + * @b integer + * @ return gcd of a and b + */ template -IntType gcd(IntType a, IntType b) { +IntType gcd(IntType a, IntType b) +{ while (b != 0) { IntType tmp = b; - b = a % b; - a = tmp; + b = a % b; + a = tmp; } return a; } diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index dc8093ca1d..e113ca92eb 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -39,9 +39,9 @@ namespace detail { /* @brief binary `sum` operator */ struct DeviceSum { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs + rhs; } }; @@ -49,7 +49,8 @@ struct DeviceSum { /* @brief binary `min` operator */ struct DeviceMin { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs < rhs ? lhs : rhs; } }; @@ -57,43 +58,44 @@ struct DeviceMin { /* @brief binary `max` operator */ struct DeviceMax { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs > rhs ? lhs : rhs; } }; /* @brief binary `product` operator */ struct DeviceProduct { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs * rhs; } }; /* @brief binary `and` operator */ struct DeviceAnd { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs & rhs); } }; /* @brief binary `or` operator */ struct DeviceOr { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs | rhs); } }; /* @brief binary `xor` operator */ struct DeviceXor { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs ^ rhs); } }; @@ -103,9 +105,9 @@ struct DeviceXor { #define errmsg_cast "size mismatch." template -__forceinline__ __device__ T_output type_reinterpret(T_input value) { - static_assert(sizeof(T_output) == sizeof(T_input), - "type_reinterpret for different size"); +__forceinline__ __device__ T_output type_reinterpret(T_input value) +{ + static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size"); return *(reinterpret_cast(&value)); } @@ -118,25 +120,22 @@ struct genericAtomicOperationImpl; // single byte atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = T((old >> shift) & 0xff); - uint8_t updating_value = - type_reinterpret(op(target_value, update_value)); - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = T((old >> shift) & 0xff); + uint8_t updating_value = type_reinterpret(op(target_value, update_value)); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return T((old >> shift) & 0xff); @@ -146,26 +145,24 @@ struct genericAtomicOperationImpl { // 2 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { - using T_int = unsigned int; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { + using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = - type_reinterpret(op(target_value, update_value)); - - T_int new_value = (is_32_align) - ? (old & 0xffff0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); + uint16_t updating_value = type_reinterpret(op(target_value, update_value)); + + T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value + : (old & 0xffff) | (T_int(updating_value) << 16); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return (is_32_align) ? T(old & 0xffff) : T(old >> 16); @@ -176,15 +173,15 @@ struct genericAtomicOperationImpl { // 4 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; T old_value = *addr; T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -201,8 +198,8 @@ struct genericAtomicOperationImpl { // 8 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -210,7 +207,7 @@ struct genericAtomicOperationImpl { T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -226,8 +223,8 @@ struct genericAtomicOperationImpl { // ------------------------------------------------------------------------------------------------- // specialized functions for operators -// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.) -// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int +// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is +// not supproted.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int // CUDA natively supports `unsigned long long int` for `atomicAdd`, @@ -240,12 +237,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -253,12 +249,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -273,12 +268,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -286,12 +280,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMin op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMin(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -299,48 +292,44 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMax op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMax(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceAnd op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAnd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceOr op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicOr(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceXor op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicXor(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -353,13 +342,12 @@ struct typesAtomicCASImpl; template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); // the 'target_value' in `old` can be different from `compare` // because other thread may update the value @@ -370,15 +358,14 @@ struct typesAtomicCASImpl { uint8_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = T((old >> shift) & 0xff); // have to compare `target_value` and `compare` before calling atomicCAS // the `target_value` in `old` can be different with `compare` if (target_value != compare) break; - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return target_value; @@ -387,13 +374,13 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; @@ -401,12 +388,12 @@ struct typesAtomicCASImpl { uint16_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); if (target_value != compare) break; - T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val - : (old & 0xffff) | (T_int(u_val) << 16); + T_int new_value = + (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); @@ -416,8 +403,8 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; T_int ret = atomicCAS(reinterpret_cast(addr), @@ -431,8 +418,8 @@ struct typesAtomicCASImpl { // 8 bytes atomic operation template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -464,11 +451,10 @@ struct typesAtomicCASImpl { * @returns The old value at `address` * -------------------------------------------------------------------------**/ template -typename std::enable_if_t::value, T> __forceinline__ - __device__ - genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; +typename std::enable_if_t::value, T> __forceinline__ __device__ +genericAtomicOperation(T* address, T const& update_value, BinaryOp op) +{ + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -476,11 +462,11 @@ typename std::enable_if_t::value, T> __forceinline__ template __forceinline__ __device__ bool genericAtomicOperation(bool* address, bool const& update_value, - BinaryOp op) { + BinaryOp op) +{ using T = bool; // don't use underlying type to apply operation for bool - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -502,9 +488,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceSum{}); +__forceinline__ __device__ T atomicAdd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{}); } /** @@ -523,9 +509,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMin{}); +__forceinline__ __device__ T atomicMin(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{}); } /** @@ -544,9 +530,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMax{}); +__forceinline__ __device__ T atomicMax(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{}); } /** @@ -566,9 +552,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { - return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, - val); +__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) +{ + return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, val); } /** @@ -586,11 +572,10 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceAnd{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicAnd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{}); } /** @@ -608,11 +593,10 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) { - return raft::genericAtomicOperation(address, val, - raft::device_atomics::detail::DeviceOr{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicOr(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{}); } /** @@ -630,9 +614,8 @@ __forceinline__ __device__ T atomicOr(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceXor{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicXor(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{}); } diff --git a/cpp/include/raft/distance/canberra.cuh b/cpp/include/raft/distance/canberra.cuh index b87c295eb0..61622d7c87 100644 --- a/cpp/include/raft/distance/canberra.cuh +++ b/cpp/include/raft/distance/canberra.cuh @@ -44,75 +44,108 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch work */ -template -static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void canberraImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - const auto add = raft::myAbs(x) + raft::myAbs(y); + const auto add = raft::myAbs(x) + raft::myAbs(y); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto canberraRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); + auto canberraRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); canberraRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto canberraColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); + auto canberraColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); canberraColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void canberra(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { canberraImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -137,16 +170,25 @@ void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void canberraImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - canberraOutType; + typedef typename std::conditional::type canberraOutType; Index_ lda, ldb, ldd; - canberraOutType *pDcast = reinterpret_cast(pD); + canberraOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; canberra( diff --git a/cpp/include/raft/distance/chebyshev.cuh b/cpp/include/raft/distance/chebyshev.cuh index 8d53408cf8..b7ecdb945b 100644 --- a/cpp/include/raft/distance/chebyshev.cuh +++ b/cpp/include/raft/distance/chebyshev.cuh @@ -44,72 +44,105 @@ namespace distance { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void chebyshevImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - acc = raft::myMax(acc, diff); + acc = raft::myMax(acc, diff); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto chebyshevRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevRowMajor); + auto chebyshevRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevRowMajor); chebyshevRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto chebyshevColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevColMajor); + auto chebyshevColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevColMajor); chebyshevColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void chebyshev(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { chebyshevImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -134,16 +167,25 @@ void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void chebyshevImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - chebyshevOutType; + typedef typename std::conditional::type chebyshevOutType; Index_ lda, ldb, ldd; - chebyshevOutType *pDcast = reinterpret_cast(pD); + chebyshevOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; chebyshev( diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index ed9bd28b7f..3e034e15d2 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -24,7 +24,7 @@ namespace distance { /** * @brief the cosine distance matrix calculation implementer - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -49,30 +49,43 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void cosineImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -85,43 +98,66 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto cosineRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); + auto cosineRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto cosineColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); + auto cosineColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); cosineColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, const DataT *yn, - OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { +template +void cosine(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { cosineImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -130,7 +166,7 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the expanded cosine distance matrix calculation - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam IType input data-type (for A and B matrices) * @tparam AccType accumulation data-type @@ -151,12 +187,23 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, AccType *workspace, - size_t worksize, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void cosineAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + AccType* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); }; // Wrap fin_op to allow computing 1 - pA before calling fin_op @@ -165,39 +212,33 @@ void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, }; typedef std::is_same is_bool; - typedef typename std::conditional::type - CosOutType; - CosOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type CosOutType; + CosOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, - stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream); } else { lda = n, ldb = m, ldd = m; - cosine(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, - wrapped_fin_op, stream); + cosine( + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream); } } diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 1b39a6ec18..1627753b43 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -32,140 +32,314 @@ namespace raft { namespace distance { namespace { -template struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {} + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg = 2.0f) + { + } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo1(m, n, k, x, y, dist, false, - (AccType *)workspace, worksize, - fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo1( + m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo1(m, n, k, x, y, dist, true, - (AccType *)workspace, worksize, - fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo1( + m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { raft::distance::cosineAlgo1( - m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, - isRowMajor); + m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo2(m, n, k, x, y, dist, false, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo2( + m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo2(m, n, k, x, y, dist, true, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo2( + m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { raft::distance::l1Impl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::chebyshevImpl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::chebyshevImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::hellingerImpl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::hellingerImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::minkowskiImpl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor, metric_arg); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::minkowskiImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { raft::distance::canberraImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } @@ -189,13 +363,15 @@ struct DistanceImpl -size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, - Index_ k) { - size_t worksize = 0; - constexpr bool is_allocated = - distanceType <= raft::distance::DistanceType::CosineExpanded; +template +size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) +{ + size_t worksize = 0; + constexpr bool is_allocated = distanceType <= raft::distance::DistanceType::CosineExpanded; if (is_allocated) { worksize += m * sizeof(AccType); if (x != y) worksize += n * sizeof(AccType); @@ -228,17 +404,27 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, * as follows:
OutType fin_op(AccType in, int g_idx);
. If one needs * any other parameters, feel free to pass them via closure. */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - DistanceImpl - distImpl; - distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, - isRowMajor, metric_arg); +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + DistanceImpl distImpl; + distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -263,18 +449,26 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @note if workspace is passed as nullptr, this will return in * worksize, the number of bytes of workspace required */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { - return d_val; - }; - distance(x, y, dist, m, n, k, workspace, worksize, default_fin_op, - stream, isRowMajor, metric_arg); +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; }; + distance( + x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -298,39 +492,47 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @param isRowMajor whether the matrices are row-major or col-major */ template -void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - raft::mr::device::buffer &workspace, - cudaStream_t stream, bool isRowMajor, - Type metric_arg = 2.0f) { - auto worksize = - getWorkspaceSize(x, y, m, n, k); +void pairwise_distance_impl(const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + raft::mr::device::buffer& workspace, + cudaStream_t stream, + bool isRowMajor, + Type metric_arg = 2.0f) +{ + auto worksize = getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); - distance(x, y, dist, m, n, k, - workspace.data(), worksize, - stream, isRowMajor, metric_arg); + distance( + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); } template -void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - raft::mr::device::buffer &workspace, - raft::distance::DistanceType metric, cudaStream_t stream, - bool isRowMajor = true, Type metric_arg = 2.0f) { +void pairwise_distance(const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + raft::mr::device::buffer& workspace, + raft::distance::DistanceType metric, + cudaStream_t stream, + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2SqrtExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L1: @@ -338,13 +540,11 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2Unexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::Linf: @@ -352,22 +552,18 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor, metric_arg); break; case raft::distance::DistanceType::Canberra: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; - default: - THROW("Unknown or unsupported distance metric '%d'!", (int)metric); + default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } /** @} */ diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index 484da0e5bf..46d0a1a4a9 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -48,30 +48,44 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, - IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanExpImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -93,47 +107,68 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto euclideanExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); + auto euclideanExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); + auto euclideanExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void euclideanExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanExpImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -161,53 +196,59 @@ void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - AccType *workspace, size_t &worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + AccType* workspace, + size_t& worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(InType in) { return in; }; typedef std::is_same is_bool; - typedef typename std::conditional::type - ExpOutType; - ExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type ExpOutType; + ExpOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; euclideanExp( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, - fin_op, stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; euclideanExp( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, - fin_op, stream); + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream); } } /** - * @brief the unexpanded euclidean distance matrix calculation + * @brief the unexpanded euclidean distance matrix calculation * It computes the following equation: cij = op((ai-bj)^2) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -227,16 +268,30 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -247,10 +302,11 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { if (sqrt) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -263,48 +319,68 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto euclideanUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpRowMajor); + auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor); euclideanUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpColMajor); + auto euclideanUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpColMajor); euclideanUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanUnExpImpl( x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -330,15 +406,25 @@ void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo2(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - UnExpOutType; - UnExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type UnExpOutType; + UnExpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index b96a536e38..f80b4eb8f7 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -35,24 +35,24 @@ template struct KVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template struct MinAndDistanceReduceOp { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, KVP* out, const KVP& other) { + DI void operator()(LabelT rid, KVP* out, const KVP& other) + { if (other.value < out->value) { - out->key = other.key; + out->key = other.key; out->value = other.value; } } - DI void init(KVP* out, DataT maxVal) { - out->key = -1; + DI void init(KVP* out, DataT maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -60,30 +60,28 @@ struct MinAndDistanceReduceOp { template struct MinReduceOp { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, DataT* out, const KVP& other) { - if (other.value < *out) { - *out = other.value; - } + DI void operator()(LabelT rid, DataT* out, const KVP& other) + { + if (other.value < *out) { *out = other.value; } } DI void init(DataT* out, DataT maxVal) { *out = maxVal; } }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { +__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < m) { - redOp.init(min + tid, maxVal); - } + if (tid < m) { redOp.init(min + tid, maxVal); } } // TODO: specialize this function for MinAndDistanceReduceOp // with atomicCAS of 64 bit which will eliminate mutex and shfls -template -DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, - IdxT m, IdxT gridStrideY) { - const auto lid = threadIdx.x % raft::WarpSize; +template +DI void updateReducedVal( + int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY) +{ + const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; // for now have first lane from each warp update a unique output row. This @@ -108,21 +106,38 @@ DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, if (j < (raft::WarpSize / P::AccThCols) - 1) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); + auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols); - val[i] = {tmpkey, tmpvalue}; + val[i] = {tmpkey, tmpvalue}; } } } } -template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( - OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, - IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, - KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) { +__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; typedef cub::KeyValuePair KVPair; @@ -135,7 +150,9 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( // epilogue operation lambda for final value calculation auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); @@ -164,72 +181,105 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; - KVPair tmp = {tmpkey, acc[i][j]}; + KVPair tmp = {tmpkey, acc[i][j]}; if (tmpkey < n) { - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } } }; - auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, - maxVal] __device__(IdxT gridStrideY) { - KVPReduceOpT pairRed_op(pairRedOp); - ReduceOpT red_op(redOp); + auto rowEpilog_lambda = + [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) { + KVPReduceOpT pairRed_op(pairRedOp); + ReduceOpT red_op(redOp); - const auto accrowid = threadIdx.x / P::AccThCols; - const auto lid = raft::laneId(); + const auto accrowid = threadIdx.x / P::AccThCols; + const auto lid = raft::laneId(); // reduce #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); - KVPair tmp = {tmpkey, tmpvalue}; - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + for (int j = P::AccThCols / 2; j > 0; j >>= 1) { + auto tmpkey = raft::shfl(val[i].key, lid + j); + auto tmpvalue = raft::shfl(val[i].value, lid + j); + KVPair tmp = {tmpkey, tmpvalue}; + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + } } - } - updateReducedVal(mutex, min, val, red_op, - m, gridStrideY); + updateReducedVal(mutex, min, val, red_op, m, gridStrideY); // reset the val array. #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; - } - }; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } + }; IdxT lda = k, ldb = k, ldd = n; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op, - epilog_lambda, fin_op, rowEpilog_lambda); + PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + xn, + yn, + nullptr, + smem, + core_op, + epilog_lambda, + fin_op, + rowEpilog_lambda); obj.run(); } -template -void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NNImpl(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + int* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ typedef typename linalg::Policy4x4::Policy P; dim3 blk(P::Nthreads); - auto nblks = raft::ceildiv(m, P::Nthreads); + auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); typedef cub::KeyValuePair KVPair; // Accumulation operation lambda - auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { @@ -240,25 +290,34 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; - constexpr size_t shmemSize = - P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); + constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); if (sqrt) { - auto fusedL2NNSqrt = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); + auto fusedL2NNSqrt = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); fusedL2NNSqrt<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, - core_lambda, fin_op); + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } else { - auto fusedL2NN = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); - fusedL2NN<<>>(min, x, y, xn, yn, m, n, k, - maxVal, workspace, redOp, - pairRedOp, core_lambda, fin_op); + auto fusedL2NN = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); + fusedL2NN<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); @@ -299,25 +358,32 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, * main kernel launch * @param[in] stream cuda stream */ -template -void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NN(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ size_t bytes = sizeof(DataT) * k; if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } } diff --git a/cpp/include/raft/distance/hellinger.cuh b/cpp/include/raft/distance/hellinger.cuh index f7ad3ed1ba..c8c7dad7d4 100644 --- a/cpp/include/raft/distance/hellinger.cuh +++ b/cpp/include/raft/distance/hellinger.cuh @@ -23,7 +23,7 @@ namespace distance { /** * @brief the Hellinger distance matrix using the expanded form: - * It computes the following equation: + * It computes the following equation: cij = sqrt(1 - sum(sqrt(x_k * y_k))) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -51,29 +51,40 @@ namespace distance { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void hellingerImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); - auto unaryOp_lambda = [] __device__(DataT input) { - return raft::mySqrt(input); - }; + auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); }; // First sqrt x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } // Accumulation operation lambda @@ -84,71 +95,91 @@ static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); + const auto finalVal = (1 - acc[i][j]); const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::mySqrt(rectifier * finalVal); + acc[i][j] = raft::mySqrt(rectifier * finalVal); } } }; if (isRowMajor) { - auto hellingerRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerRowMajor); + auto hellingerRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerRowMajor); hellingerRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto hellingerColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerColMajor); + auto hellingerColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerColMajor); hellingerColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } // Revert sqrt of x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } CUDA_CHECK(cudaGetLastError()); } -template -void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void hellinger(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { hellingerImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -157,7 +188,7 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Hellinger distance matrix calculation - * It computes the following equation: + * It computes the following equation: sqrt(1 - sum(sqrt(x_k * y_k)) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -179,16 +210,25 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void hellingerImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - hellingerOutType; + typedef typename std::conditional::type hellingerOutType; Index_ lda, ldb, ldd; - hellingerOutType *pDcast = reinterpret_cast(pD); + hellingerOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; hellinger( diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index 6ab084f041..268e269391 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -42,16 +42,29 @@ namespace distance { * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void l1Impl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -62,47 +75,69 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto l1RowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); + auto l1RowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); l1RowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto l1ColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); + auto l1ColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); l1ColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, - const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void l1(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - l1Impl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + l1Impl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { l1Impl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -130,16 +165,25 @@ void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void l1Impl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void l1Impl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef - typename std::conditional::type L1OutType; + typedef typename std::conditional::type L1OutType; Index_ lda, ldb, ldd; - L1OutType *pDcast = reinterpret_cast(pD); + L1OutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; l1( diff --git a/cpp/include/raft/distance/minkowski.cuh b/cpp/include/raft/distance/minkowski.cuh index 803f5fc78a..c021954f32 100644 --- a/cpp/include/raft/distance/minkowski.cuh +++ b/cpp/include/raft/distance/minkowski.cuh @@ -21,7 +21,7 @@ namespace raft { namespace distance { /** - * @brief the unexpanded Minkowski distance matrix calculation + * @brief the unexpanded Minkowski distance matrix calculation * It computes the following equation: cij = sum(|x - y|^p)^(1/p) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -44,16 +44,30 @@ namespace distance { * @param[in] stream cuda stream to launch work * @param[in] the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT p) { +template +void minkowskiUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT p) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -64,10 +78,11 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [p] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { const auto one_over_p = 1.0f / p; #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -79,48 +94,68 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto minkowskiUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpRowMajor); + auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor); minkowskiUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto minkowskiUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpColMajor); + auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor); minkowskiUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) { +template +void minkowskiUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT metric_arg) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else { minkowskiUnExpImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); @@ -146,15 +181,25 @@ void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] isRowMajor whether the input and output matrices are row major * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +void minkowskiImpl(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - LpUnexpOutType; - LpUnexpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type LpUnexpOutType; + LpUnexpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 43abc9eb65..3db4dc0131 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -31,11 +31,11 @@ namespace distance { * @tparam OutT output data-type (for C and D matrices) * @tparam IdxT index data-type * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda tells how to accumulate an x and y into + * @tparam CoreLambda tells how to accumulate an x and y into acc. its signature: template void core_lambda(AccT& acc, const DataT& x, const DataT& y) - * @tparam EpilogueLambda applies an elementwise function to compute final + * @tparam EpilogueLambda applies an elementwise function to compute final values. Its signature is: template void epilogue_lambda (AccT acc[][], DataT* regxn, DataT* regyn); @@ -57,13 +57,19 @@ namespace distance { * @param fin_op the final gemm epilogue lambda */ -template > +template > struct PairwiseDistances : public BaseClass { private: typedef Policy P; @@ -81,11 +87,21 @@ struct PairwiseDistances : public BaseClass { public: // Constructor - DI PairwiseDistances(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, - const DataT* _xn, const DataT* _yn, OutT* _dOutput, - char* _smem, CoreLambda _core_op, - EpilogueLambda _epilog_op, FinalLambda _fin_op, + DI PairwiseDistances(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + const DataT* _xn, + const DataT* _yn, + OutT* _dOutput, + char* _smem, + CoreLambda _core_op, + EpilogueLambda _epilog_op, + FinalLambda _fin_op, rowEpilogueLambda _rowEpilog_op) : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), xn(_xn), @@ -96,9 +112,12 @@ struct PairwiseDistances : public BaseClass { core_op(_core_op), epilog_op(_epilog_op), fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op) {} + rowEpilog_op(_rowEpilog_op) + { + } - DI void run() { + DI void run() + { for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; gridStrideY += P::Mblk * gridDim.y) { for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; @@ -112,7 +131,8 @@ struct PairwiseDistances : public BaseClass { } private: - DI void updateIndicesY() { + DI void updateIndicesY() + { const auto stride = P::Nblk * gridDim.x; if (isRowMajor) { this->y += stride * this->ldb; @@ -122,21 +142,23 @@ struct PairwiseDistances : public BaseClass { this->yrowid += stride; } - DI void updateIndicesXY() { + DI void updateIndicesXY() + { const auto stride = P::Mblk * gridDim.y; if (isRowMajor) { this->x += stride * this->lda; this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; - this->y = yBase + this->yrowid * this->ldb; + this->y = yBase + this->yrowid * this->ldb; } else { this->x += stride; this->yrowid = IdxT(blockIdx.x) * P::Nblk; - this->y = yBase + this->yrowid + this->srowid * this->ldb; + this->y = yBase + this->yrowid + this->srowid * this->ldb; } this->xrowid += stride; } - DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) { + DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) + { // Fetch next grid stride ldg if within range if ((gridStrideX + gridDim.x * P::Nblk) < this->n) { updateIndicesY(); @@ -147,10 +169,9 @@ struct PairwiseDistances : public BaseClass { } } - DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { - if (gridStrideX == blockIdx.x * P::Nblk) { - this->ldgXY(0); - } + DI void prolog(IdxT gridStrideX, IdxT gridStrideY) + { + if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); } #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -165,7 +186,8 @@ struct PairwiseDistances : public BaseClass { this->pageWr ^= 1; } - DI void loop() { + DI void loop() + { for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { this->ldgXY(kidx); accumulate(); // on the previous k-block @@ -182,7 +204,8 @@ struct PairwiseDistances : public BaseClass { this->pageRd ^= 1; } - DI void accumulate() { + DI void accumulate() + { #pragma unroll for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { this->ldsXY(ki); @@ -199,7 +222,8 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT gridStrideX, IdxT gridStrideY) { + DI void epilog(IdxT gridStrideX, IdxT gridStrideY) + { if (useNorms) { DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); DataT* syNorm = (&sxNorm[P::Mblk]); @@ -207,13 +231,13 @@ struct PairwiseDistances : public BaseClass { // Load x & y norms required by this threadblock in shmem buffer if (gridStrideX == blockIdx.x * P::Nblk) { for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; + auto idx = gridStrideY + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = gridStrideX + i; + auto idx = gridStrideX + i; syNorm[i] = idx < this->n ? yn[idx] : 0; } @@ -288,42 +312,67 @@ struct PairwiseDistances : public BaseClass { * @param fin_op the final gemm epilogue lambda */ -template -__global__ __launch_bounds__( - Policy::Nthreads, - 2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y, - const DataT* _xn, const DataT* _yn, IdxT m, - IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT* dOutput, CoreLambda core_op, - EpilogueLambda epilog_op, - FinalLambda fin_op) { +template +__global__ __launch_bounds__(Policy::Nthreads, + 2) void pairwiseDistanceMatKernel(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + CoreLambda core_op, + EpilogueLambda epilog_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; auto rowEpilog = [] __device__(IdxT starty) { return; }; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, - epilog_op, fin_op, rowEpilog); + PairwiseDistances + obj( + x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); obj.run(); } template -dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { - const auto numSMs = raft::getMultiProcessorCount(); +dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) +{ + const auto numSMs = raft::getMultiProcessorCount(); int numBlocksPerSm = 0; dim3 grid; - CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, func, P::Nthreads, sMemSize)); + CUDA_CHECK( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize)); int minGridSize = numSMs * numBlocksPerSm; - int yChunks = raft::ceildiv(m, P::Mblk); - int xChunks = raft::ceildiv(n, P::Nblk); - grid.y = yChunks > minGridSize ? minGridSize : yChunks; - grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; + int yChunks = raft::ceildiv(m, P::Mblk); + int xChunks = raft::ceildiv(n, P::Nblk); + grid.y = yChunks > minGridSize ? minGridSize : yChunks; + grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { int i = 1; while (grid.y * i < minGridSize) { diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index c62f2e5f79..773b83ab13 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -31,14 +31,14 @@ class exception : public std::exception { explicit exception() noexcept : std::exception(), msg_() {} /** copy ctor */ - exception(exception const& src) noexcept - : std::exception(), msg_(src.what()) { + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { collect_call_stack(); } /** ctor from an input message */ - explicit exception(std::string const msg) noexcept - : std::exception(), msg_(std::move(msg)) { + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { collect_call_stack(); } @@ -51,7 +51,8 @@ class exception : public std::exception { /** append call stack info to this exception's message for ease of debug */ // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept { + void collect_call_stack() noexcept + { #ifdef __GNUC__ constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT @@ -90,16 +91,16 @@ struct logic_error : public raft::exception { // FIXME: Need to be replaced with RAFT_FAIL /** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; /* NOLINT */ \ - std::snprintf(errMsg, sizeof(errMsg), \ - "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw raft::exception(msg); \ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf( \ + errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ } while (0) // FIXME: Need to be replaced with RAFT_EXPECTS @@ -109,16 +110,15 @@ struct logic_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ - do { \ - char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ - __LINE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ - msg += err_msg; \ +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ + do { \ + char err_msg[2048]; /* NOLINT */ \ + std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ + msg += err_msg; \ } while (0) /** diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index dbe7e83189..bb7d22e079 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -65,29 +65,29 @@ class handle_t { }()), streams_(n_streams), device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) { + host_allocator_(std::make_shared()) + { create_resources(); } /** - * @brief Construct a light handle copy from another + * @brief Construct a light handle copy from another * user stream, cuda handles, comms and worker pool are not copied - * The user_stream of the returned handle is set to the specified stream - * of the other handle worker pool - * @param[in] stream_id stream id in `other` worker streams + * The user_stream of the returned handle is set to the specified stream + * of the other handle worker pool + * @param[in] stream_id stream id in `other` worker streams * to be set as user stream in the constructed handle * @param[in] n_streams number worker streams to be created */ - handle_t(const handle_t& other, int stream_id, - int n_streams = kNumDefaultWorkerStreams) - : dev_id_(other.get_device()), streams_(n_streams) { - RAFT_EXPECTS( - other.get_num_internal_streams() > 0, - "ERROR: the main handle must have at least one worker stream\n"); - prop_ = other.get_device_properties(); + handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) + : dev_id_(other.get_device()), streams_(n_streams) + { + RAFT_EXPECTS(other.get_num_internal_streams() > 0, + "ERROR: the main handle must have at least one worker stream\n"); + prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); + device_allocator_ = other.get_device_allocator(); + host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); } @@ -99,25 +99,22 @@ class handle_t { void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } - rmm::cuda_stream_view get_stream_view() const { - return rmm::cuda_stream_view(user_stream_); - } + rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); } - void set_device_allocator(std::shared_ptr allocator) { + void set_device_allocator(std::shared_ptr allocator) + { device_allocator_ = allocator; } - std::shared_ptr get_device_allocator() const { - return device_allocator_; - } + std::shared_ptr get_device_allocator() const { return device_allocator_; } - void set_host_allocator(std::shared_ptr allocator) { + void set_host_allocator(std::shared_ptr allocator) + { host_allocator_ = allocator; } - std::shared_ptr get_host_allocator() const { - return host_allocator_; - } + std::shared_ptr get_host_allocator() const { return host_allocator_; } - cublasHandle_t get_cublas_handle() const { + cublasHandle_t get_cublas_handle() const + { std::lock_guard _(mutex_); if (!cublas_initialized_) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); @@ -126,7 +123,8 @@ class handle_t { return cublas_handle_; } - cusolverDnHandle_t get_cusolver_dn_handle() const { + cusolverDnHandle_t get_cusolver_dn_handle() const + { std::lock_guard _(mutex_); if (!cusolver_dn_initialized_) { CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); @@ -135,7 +133,8 @@ class handle_t { return cusolver_dn_handle_; } - cusolverSpHandle_t get_cusolver_sp_handle() const { + cusolverSpHandle_t get_cusolver_sp_handle() const + { std::lock_guard _(mutex_); if (!cusolver_sp_initialized_) { CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); @@ -144,7 +143,8 @@ class handle_t { return cusolver_sp_handle_; } - cusparseHandle_t get_cusparse_handle() const { + cusparseHandle_t get_cusparse_handle() const + { std::lock_guard _(mutex_); if (!cusparse_initialized_) { CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); @@ -154,16 +154,13 @@ class handle_t { } // legacy compatibility for cuML - cudaStream_t get_internal_stream(int sid) const { - return streams_.get_stream(sid).value(); - } + cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); } // new accessor return rmm::cuda_stream_view - rmm::cuda_stream_view get_internal_stream_view(int sid) const { - return streams_.get_stream(sid); - } + rmm::cuda_stream_view get_internal_stream_view(int sid) const { return streams_.get_stream(sid); } int get_num_internal_streams() const { return streams_.get_pool_size(); } - std::vector get_internal_streams() const { + std::vector get_internal_streams() const + { std::vector int_streams_vec; for (int i = 0; i < get_num_internal_streams(); i++) { int_streams_vec.push_back(get_internal_stream(i)); @@ -171,49 +168,51 @@ class handle_t { return int_streams_vec; } - void wait_on_user_stream() const { + void wait_on_user_stream() const + { CUDA_CHECK(cudaEventRecord(event_, user_stream_)); for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); } } - void wait_on_internal_streams() const { + void wait_on_internal_streams() const + { for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i))); CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); } } - void set_comms(std::shared_ptr communicator) { - communicator_ = communicator; - } + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - const comms::comms_t& get_comms() const { - RAFT_EXPECTS(this->comms_initialized(), - "ERROR: Communicator was not initialized\n"); + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); return *communicator_; } - void set_subcomm(std::string key, std::shared_ptr subcomm) { + void set_subcomm(std::string key, std::shared_ptr subcomm) + { subcomms_[key] = subcomm; } - const comms::comms_t& get_subcomm(std::string key) const { - RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(), - "%s was not found in subcommunicators.", key.c_str()); + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); auto subcomm = subcomms_.at(key); - RAFT_EXPECTS(nullptr != subcomm.get(), - "ERROR: Subcommunicator was not initialized"); + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); return *subcomm; } bool comms_initialized() const { return (nullptr != communicator_.get()); } - const cudaDeviceProp& get_device_properties() const { + const cudaDeviceProp& get_device_properties() const + { std::lock_guard _(mutex_); if (!device_prop_initialized_) { CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); @@ -244,29 +243,28 @@ class handle_t { mutable bool device_prop_initialized_{false}; mutable std::mutex mutex_; - void create_resources() { - CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } + void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } - void destroy_resources() { + void destroy_resources() + { ///@todo: enable *_NO_THROW variants once we have enabled logging if (cusparse_initialized_) { - //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); + // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); } if (cusolver_dn_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); } if (cusolver_sp_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); } if (cublas_initialized_) { - //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); + // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); + // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); } }; // class handle_t @@ -276,7 +274,8 @@ class handle_t { */ class stream_syncer { public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { + explicit stream_syncer(const handle_t& handle) : handle_(handle) + { handle_.wait_on_user_stream(); } ~stream_syncer() { handle_.wait_on_internal_streams(); } diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h index a7cfb9287b..5fc56de14b 100644 --- a/cpp/include/raft/integer_utils.h +++ b/cpp/include/raft/integer_utils.h @@ -34,15 +34,13 @@ namespace raft { * `modulus` is positive. */ template -inline S round_up_safe(S number_to_round, S modulus) { +inline S round_up_safe(S number_to_round, S modulus) +{ auto remainder = number_to_round % modulus; - if (remainder == 0) { - return number_to_round; - } + if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument( - "Attempt to round up beyond the type's maximum value"); + throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); } return rounded_up; } @@ -53,8 +51,9 @@ inline S round_up_safe(S number_to_round, S modulus) { * `modulus` is positive. */ template -inline S round_down_safe(S number_to_round, S modulus) { - auto remainder = number_to_round % modulus; +inline S round_down_safe(S number_to_round, S modulus) +{ + auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } @@ -72,25 +71,28 @@ inline S round_down_safe(S number_to_round, S modulus) { * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, - const T& divisor) noexcept { +constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +{ return (dividend + divisor - 1) / divisor; } namespace detail { template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { + I dividend, + I divisor) noexcept +{ // TODO: This could probably be implemented faster - return (dividend > divisor) - ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) - : (dividend > 0); + return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) + : (dividend > 0); } template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { - auto quotient = dividend / divisor; + I dividend, + I divisor) noexcept +{ + auto quotient = dividend / divisor; auto remainder = dividend % divisor; return quotient + (remainder != 0); } @@ -110,16 +112,17 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline std::enable_if_t::value, I> -div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = - std::integral_constant::value>; +constexpr inline std::enable_if_t::value, I> div_rounding_up_safe( + I dividend, I divisor) noexcept +{ + using i_is_a_signed_type = std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline std::enable_if_t::value, bool> -is_a_power_of_two(I val) noexcept { +constexpr inline std::enable_if_t::value, bool> is_a_power_of_two( + I val) noexcept +{ return ((val - 1) & val) == 0; } @@ -147,14 +150,14 @@ is_a_power_of_two(I val) noexcept { * @return Absolute value if value type is signed. */ template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return std::abs(value); } // Unsigned type just returns itself. template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return value; } diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index 0da7da2eb6..0bbfa2bb3c 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -43,33 +43,35 @@ namespace label { * \param [in] allocator device allocator */ template -void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, +void getUniquelabels(value_t* y, + size_t n, + value_t** y_unique, + int* n_unique, cudaStream_t stream, - std::shared_ptr allocator) { + std::shared_ptr allocator) +{ raft::mr::device::buffer y2(allocator, stream, n); raft::mr::device::buffer y3(allocator, stream, n); raft::mr::device::buffer d_num_selected(allocator, stream, 1); - size_t bytes = 0; + size_t bytes = 0; size_t bytes2 = 0; // Query how much temporary storage we will need for cub operations // and allocate it cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), - d_num_selected.data(), n); + cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), d_num_selected.data(), n); bytes = max(bytes, bytes2); raft::mr::device::buffer cub_storage(allocator, stream, bytes); // Select Unique classes cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), - d_num_selected.data(), n); + cub::DeviceSelect::Unique( + cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n); raft::update_host(n_unique, d_num_selected.data(), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output - *y_unique = - (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream); + *y_unique = (value_t*)allocator->allocate(*n_unique * sizeof(value_t), stream); raft::copy(*y_unique, y3.data(), *n_unique, stream); } @@ -92,16 +94,17 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, * \param [in] stream cuda stream */ template -void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, - value_t *y_out, int idx, cudaStream_t stream) { +void getOvrlabels( + value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) +{ ASSERT(idx < n_classes, "Parameter idx should not be larger than the number " "of classes"); raft::linalg::unaryOp( - y_out, y, n, - [idx, y_unique] __device__(value_t y) { - return y == y_unique[idx] ? +1 : -1; - }, + y_out, + y, + n, + [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; }, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -110,9 +113,14 @@ void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, - Type *out, size_t N, Lambda filter_op, - bool zero_based = false) { +__global__ void map_label_kernel(Type* map_ids, + size_t N_labels, + Type* in, + Type* out, + size_t N, + Lambda filter_op, + bool zero_based = false) +{ int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (!filter_op(in[tid])) { @@ -127,68 +135,75 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out the output monotonic array - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - * @param filter_op an optional function for specifying which values - * should have monotonically increasing labels applied to them. - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out the output monotonic array + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, +void make_monotonic(Type* out, + Type* in, + size_t N, + cudaStream_t stream, Lambda filter_op, std::shared_ptr allocator, - bool zero_based = false) { + bool zero_based = false) +{ static const size_t TPB_X = 256; dim3 blocks(raft::ceildiv(N, TPB_X)); dim3 threads(TPB_X); - Type *map_ids; + Type* map_ids; int num_clusters; getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator); - map_label_kernel<<>>( - map_ids, num_clusters, in, out, N, filter_op, zero_based); + map_label_kernel + <<>>(map_ids, num_clusters, in, out, N, filter_op, zero_based); allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out output label array with labels assigned monotonically - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out output label array with labels assigned monotonically + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, +void make_monotonic(Type* out, + Type* in, + size_t N, + cudaStream_t stream, std::shared_ptr allocator, - bool zero_based = false) { + bool zero_based = false) +{ make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, - zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, allocator, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index bed74581a2..1ee0659b0d 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -35,8 +35,10 @@ __global__ void __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, value_idx* __restrict__ R, - const bool* __restrict__ mask, bool* __restrict__ m, - value_idx N) { + const bool* __restrict__ mask, + bool* __restrict__ m, + value_idx N) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (__ldg((char*)mask + tid)) { @@ -65,15 +67,17 @@ template __global__ void __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, value_idx N, - value_idx MAX_LABEL) { + const value_idx* __restrict__ R, + value_idx N, + value_idx MAX_LABEL) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { // Note: labels are from 1 to N - value_idx la = labels_a[tid]; - value_idx lb = __ldg(labels_b + tid); - value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; - value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; + value_idx la = labels_a[tid]; + value_idx lb = __ldg(labels_b + tid); + value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; + value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; labels_a[tid] = min(ra, rb); } } @@ -108,9 +112,14 @@ __global__ void __launch_bounds__(TPB_X) * @param[in] stream CUDA stream */ template -void merge_labels(value_idx* labels_a, const value_idx* labels_b, - const bool* mask, value_idx* R, bool* m, value_idx N, - cudaStream_t stream) { +void merge_labels(value_idx* labels_a, + const value_idx* labels_b, + const bool* mask, + value_idx* R, + bool* m, + value_idx N, + cudaStream_t stream) +{ dim3 blocks(raft::ceildiv(N, value_idx(TPB_X))); dim3 threads(TPB_X); value_idx MAX_LABEL = std::numeric_limits::max(); diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h index ed545b7198..e488dc528f 100644 --- a/cpp/include/raft/lap/d_structs.h +++ b/cpp/include/raft/lap/d_structs.h @@ -26,18 +26,18 @@ template struct Vertices { - vertex_t *row_assignments; - vertex_t *col_assignments; - int *row_covers; - int *col_covers; - weight_t *row_duals; - weight_t *col_duals; - weight_t *col_slacks; + vertex_t* row_assignments; + vertex_t* col_assignments; + int* row_covers; + int* col_covers; + weight_t* row_duals; + weight_t* col_duals; + weight_t* col_slacks; }; template struct VertexData { - vertex_t *parents; - vertex_t *children; - int *is_visited; + vertex_t* parents; + vertex_t* children; + int* is_visited; }; diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 6bc1c08029..64b6a31efb 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -38,12 +38,12 @@ class LinearAssignmentProblem { vertex_t batchsize_; weight_t epsilon_; - weight_t const *d_costs_; + weight_t const* d_costs_; Vertices d_vertices_dev; VertexData d_row_data_dev, d_col_data_dev; - raft::handle_t const &handle_; + raft::handle_t const& handle_; raft::mr::device::buffer row_covers_v; raft::mr::device::buffer col_covers_v; raft::mr::device::buffer row_duals_v; @@ -59,8 +59,10 @@ class LinearAssignmentProblem { raft::mr::device::buffer obj_val_dual_v; public: - LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, - vertex_t batchsize, weight_t epsilon) + LinearAssignmentProblem(raft::handle_t const& handle, + vertex_t size, + vertex_t batchsize, + weight_t epsilon) : handle_(handle), size_(size), batchsize_(batchsize), @@ -78,11 +80,13 @@ class LinearAssignmentProblem { row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {} + obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) + { + } // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, - vertex_t *d_col_assignment) { + void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) + { initializeDevice(); d_vertices_dev.row_assignments = d_row_assignment; @@ -94,27 +98,13 @@ class LinearAssignmentProblem { while (step != 100) { switch (step) { - case 0: - step = hungarianStep0(); - break; - case 1: - step = hungarianStep1(); - break; - case 2: - step = hungarianStep2(); - break; - case 3: - step = hungarianStep3(); - break; - case 4: - step = hungarianStep4(); - break; - case 5: - step = hungarianStep5(); - break; - case 6: - step = hungarianStep6(); - break; + case 0: step = hungarianStep0(); break; + case 1: step = hungarianStep1(); break; + case 2: step = hungarianStep2(); break; + case 3: step = hungarianStep3(); break; + case 4: step = hungarianStep4(); break; + case 5: step = hungarianStep5(); break; + case 6: step = hungarianStep6(); break; } } @@ -122,36 +112,39 @@ class LinearAssignmentProblem { } // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const { + std::pair getRowDualVector(int spId) const + { return std::make_pair(row_duals_v.data() + spId * size_, size_); } // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) { + std::pair getColDualVector(int spId) + { return std::make_pair(col_duals_v.data() + spId * size_, size_); } // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) { + weight_t getPrimalObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } // Function for getting optimal dual objective value for subproblem spId. - weight_t getDualObjectiveValue(int spId) { + weight_t getDualObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } private: // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() { + void initializeDevice() + { row_covers_v.resize(batchsize_ * size_); col_covers_v.resize(batchsize_ * size_); row_duals_v.resize(batchsize_ * size_); @@ -169,39 +162,36 @@ class LinearAssignmentProblem { d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); + d_vertices_dev.row_duals = row_duals_v.data(); + d_vertices_dev.col_duals = col_duals_v.data(); d_vertices_dev.col_slacks = col_slacks_v.data(); d_row_data_dev.is_visited = row_is_visited_v.data(); d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), - int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), - int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), - weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), - weight_t{0}); + d_row_data_dev.parents = row_parents_v.data(); + d_row_data_dev.children = row_children_v.data(); + d_col_data_dev.parents = col_parents_v.data(); + d_col_data_dev.children = col_children_v.data(); + + thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); + thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); + thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); + thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, - size_); + int hungarianStep0() + { + detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); return 1; } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() { - detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev, - batchsize_, size_, epsilon_); + int hungarianStep1() + { + detail::computeInitialAssignments( + handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); int next = 2; @@ -217,10 +207,10 @@ class LinearAssignmentProblem { } // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() { - int cover_count = - detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + int hungarianStep2() + { + int cover_count = detail::computeRowCovers( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); int next = (cover_count == batchsize_ * size_) ? 6 : 3; @@ -228,17 +218,23 @@ class LinearAssignmentProblem { } // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() { + int hungarianStep3() + { int next; - raft::mr::device::buffer flag_v(handle_.get_device_allocator(), - handle_.get_stream(), 1); + raft::mr::device::buffer flag_v(handle_.get_device_allocator(), handle_.get_stream(), 1); bool h_flag = false; raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); - detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, flag_v.data(), batchsize_, size_, + detail::executeZeroCover(handle_, + d_costs_, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + flag_v.data(), + batchsize_, + size_, epsilon_); raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); @@ -249,31 +245,36 @@ class LinearAssignmentProblem { } // Function for augmenting the solution along multiple node-disjoint alternating trees. - int hungarianStep4() { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, - size_); + int hungarianStep4() + { + detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); - detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + detail::augmentationPass( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); return 2; } // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() { - detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, - batchsize_, size_, epsilon_); + int hungarianStep5() + { + detail::dualUpdate( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); return 3; } // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() { - detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_, - d_vertices_dev.row_assignments, batchsize_, size_); + int hungarianStep6() + { + detail::calcObjValPrimal(handle_, + obj_val_primal_v.data(), + d_costs_, + d_vertices_dev.row_assignments, + batchsize_, + size_); - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, - batchsize_, size_); + detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); return 100; } diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 0079f50e82..9bbd44bf09 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -46,20 +46,26 @@ const int BLOCKDIMX{64}; const int BLOCKDIMY{1}; // Function for calculating grid and block dimensions from the given input size. -inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateLinearDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX * BLOCKDIMY; int value = size / threads_per_block.x; if (size % threads_per_block.x > 0) value++; - total_blocks = value; + total_blocks = value; blocks_per_grid.x = value; } // Function for calculating grid and block dimensions from the given input size for square grid. -inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateSquareDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -68,15 +74,16 @@ inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX); int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY); - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } -// Function for calculating grid and block dimensions from the given input size for rectangular grid. -inline void calculateRectangularDims(dim3 &blocks_per_grid, - dim3 &threads_per_block, int &total_blocks, - int xsize, int ysize) { +// Function for calculating grid and block dimensions from the given input size for rectangular +// grid. +inline void calculateRectangularDims( + dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -86,16 +93,18 @@ inline void calculateRectangularDims(dim3 &blocks_per_grid, int valuey = ysize / threads_per_block.y; if (ysize % threads_per_block.y > 0) valuey++; - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } template -inline void initialReduction(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices_dev, - int SP, vertex_t N) { +inline void initialReduction(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices_dev, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -103,34 +112,38 @@ inline void initialReduction(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_rowReduction<<>>( - d_costs, d_vertices_dev.row_duals, SP, N, - std::numeric_limits::max()); + kernel_rowReduction<<>>( + d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); - kernel_columnReduction<<>>( - d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N, + kernel_columnReduction<<>>( + d_costs, + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + SP, + N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); } template -inline void computeInitialAssignments(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices, - int SP, vertex_t N, weight_t epsilon) { +inline void computeInitialAssignments(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; std::size_t size = SP * N; - raft::mr::device::buffer row_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer col_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); + raft::mr::device::buffer row_lock_v( + handle.get_device_allocator(), handle.get_stream(), size); + raft::mr::device::buffer col_lock_v( + handle.get_device_allocator(), handle.get_stream(), size); thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1); thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1); @@ -140,21 +153,29 @@ inline void computeInitialAssignments(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeInitialAssignments<<>>( - d_costs, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(), - col_lock_v.data(), SP, N, epsilon); + kernel_computeInitialAssignments<<>>( + d_costs, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.row_assignments, + d_vertices.col_assignments, + row_lock_v.data(), + col_lock_v.data(), + SP, + N, + epsilon); CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. template -inline int computeRowCovers(raft::handle_t const &handle, - Vertices &d_vertices, - VertexData &d_row_data, - VertexData &d_col_data, int SP, - vertex_t N) { +inline int computeRowCovers(raft::handle_t const& handle, + Vertices& d_vertices, + VertexData& d_row_data, + VertexData& d_col_data, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -163,8 +184,7 @@ inline int computeRowCovers(raft::handle_t const &handle, thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0}); thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0}); - thrust::fill_n(thrust::device, d_vertices.col_slacks, size, - std::numeric_limits::max()); + thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits::max()); thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1}); @@ -174,25 +194,28 @@ inline int computeRowCovers(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeRowCovers<<>>( - d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, - SP, N); + kernel_computeRowCovers<<>>( + d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); - return thrust::reduce(thrust::device, d_vertices.row_covers, - d_vertices.row_covers + size); + return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); } // Function for covering the zeros in uncovered rows and expanding the frontier. template -inline void coverZeroAndExpand( - raft::handle_t const &handle, weight_t const *d_costs_dev, - vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, weight_t epsilon) { +inline void coverZeroAndExpand(raft::handle_t const& handle, + weight_t const* d_costs_dev, + vertex_t const* d_rows_csr_neighbors, + vertex_t const* d_rows_csr_ptrs, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; @@ -200,24 +223,34 @@ inline void coverZeroAndExpand( raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_coverAndExpand<<>>( - d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev, - d_row_data_dev, d_col_data_dev, SP, N, epsilon); + kernel_coverAndExpand<<>>( + d_flag, + d_rows_csr_ptrs, + d_rows_csr_neighbors, + d_costs_dev, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + SP, + N, + epsilon); } template -inline vertex_t zeroCoverIteration(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, - weight_t epsilon) { +inline vertex_t zeroCoverIteration(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M; - raft::mr::device::buffer csr_ptrs_v(handle.get_device_allocator(), - handle.get_stream(), 0); + raft::mr::device::buffer csr_ptrs_v( + handle.get_device_allocator(), handle.get_stream(), 0); raft::mr::device::buffer csr_neighbors_v( handle.get_device_allocator(), handle.get_stream(), 0); @@ -226,8 +259,8 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, dim3 threads_per_block; int total_blocks = 0; - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); + raft::mr::device::buffer predicates_v( + handle.get_device_allocator(), handle.get_stream(), SP * N); raft::mr::device::buffer addresses_v( handle.get_device_allocator(), handle.get_stream(), SP * N); @@ -242,87 +275,108 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, blocks_per_grid, threads_per_block, total_blocks, N, SP); // construct predicate matrix for edges. - kernel_rowPredicateConstructionCSR<<>>( - predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, - N); + predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); - thrust::exclusive_scan(thrust::device, addresses_v.begin(), - addresses_v.end(), addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (M > 0) { csr_neighbors_v.resize(M); - kernel_rowScatterCSR<<>>( - predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(), - csr_ptrs_v.data(), M, SP, N); + kernel_rowScatterCSR<<>>( + predicates_v.data(), + addresses_v.data(), + csr_neighbors_v.data(), + csr_ptrs_v.data(), + M, + SP, + N); CHECK_CUDA(handle.get_stream()); } } if (M > 0) { - coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(), - csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + coverZeroAndExpand(handle, + d_costs_dev, + csr_neighbors_v.data(), + csr_ptrs_v.data(), + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + d_flag, + SP, + N, + epsilon); } return M; } -// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros. +// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending +// on the presence of uncovered zeros. template -inline void executeZeroCover(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, bool *d_flag, - int SP, vertex_t N, weight_t epsilon) { +inline void executeZeroCover(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M = 1; while (M > 0) { - M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + M = zeroCoverIteration( + handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon); } } // Function for executing reverse pass of the maximum matching. template -inline void reversePass(raft::handle_t const &handle, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, int N) { +inline void reversePass(raft::handle_t const& handle, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; std::size_t size = SP * N; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, size); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), size); + raft::mr::device::buffer predicates_v( + handle.get_device_allocator(), handle.get_stream(), size); + raft::mr::device::buffer addresses_v( + handle.get_device_allocator(), handle.get_stream(), size); thrust::fill_n(thrust::device, predicates_v.data(), size, false); thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. - std::size_t csr_size = - thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); + std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (csr_size > 0) { int total_blocks_1 = 0; @@ -334,14 +388,12 @@ inline void reversePass(raft::handle_t const &handle, raft::mr::device::buffer elements_v( handle.get_device_allocator(), handle.get_stream(), csr_size); - kernel_augmentScatter<<>>( + kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); CHECK_CUDA(handle.get_stream()); - kernel_reverseTraversal<<>>( + kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); CHECK_CUDA(handle.get_stream()); } @@ -349,27 +401,30 @@ inline void reversePass(raft::handle_t const &handle, // Function for executing augmentation pass of the maximum matching. template -inline void augmentationPass(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, - int N) { +inline void augmentationPass(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP * N); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); + raft::mr::device::buffer predicates_v( + handle.get_device_allocator(), handle.get_stream(), SP * N); + raft::mr::device::buffer addresses_v( + handle.get_device_allocator(), handle.get_stream(), SP * N); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); @@ -380,8 +435,8 @@ inline void augmentationPass(raft::handle_t const &handle, vertex_t row_ids_csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (row_ids_csr_size > 0) { int total_blocks_1 = 0; @@ -393,17 +448,18 @@ inline void augmentationPass(raft::handle_t const &handle, raft::mr::device::buffer elements_v( handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size); - kernel_augmentScatter<<>>( - elements_v.data(), predicates_v.data(), addresses_v.data(), - vertex_t{SP * N}); + kernel_augmentScatter<<>>( + elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); CHECK_CUDA(handle.get_stream()); - kernel_augmentation<<>>( - d_vertices_dev.row_assignments, d_vertices_dev.col_assignments, - elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N}, + kernel_augmentation<<>>( + d_vertices_dev.row_assignments, + d_vertices_dev.col_assignments, + elements_v.data(), + d_row_data_dev, + d_col_data_dev, + vertex_t{N}, row_ids_csr_size); CHECK_CUDA(handle.get_stream()); @@ -411,35 +467,46 @@ inline void augmentationPass(raft::handle_t const &handle, } template -inline void dualUpdate(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, vertex_t N, - weight_t epsilon) { +inline void dualUpdate(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks; - raft::mr::device::buffer sp_min_v(handle.get_device_allocator(), - handle.get_stream(), 1); + raft::mr::device::buffer sp_min_v( + handle.get_device_allocator(), handle.get_stream(), 1); - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); - kernel_dualUpdate_1<<>>( - sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP, - N, std::numeric_limits::max()); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + kernel_dualUpdate_1<<>>( + sp_min_v.data(), + d_vertices_dev.col_slacks, + d_vertices_dev.col_covers, + SP, + N, + std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_dualUpdate_2<<>>( - sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals, - d_vertices_dev.col_slacks, d_vertices_dev.row_covers, - d_vertices_dev.col_covers, d_row_data_dev.is_visited, - d_col_data_dev.parents, SP, N, std::numeric_limits::max(), + kernel_dualUpdate_2<<>>( + sp_min_v.data(), + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + d_vertices_dev.col_slacks, + d_vertices_dev.row_covers, + d_vertices_dev.col_covers, + d_row_data_dev.is_visited, + d_col_data_dev.parents, + SP, + N, + std::numeric_limits::max(), epsilon); CHECK_CUDA(handle.get_stream()); @@ -447,18 +514,19 @@ inline void dualUpdate(raft::handle_t const &handle, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, - Vertices &d_vertices_dev, int SP, - int N) { +inline void calcObjValDual(raft::handle_t const& handle, + weight_t* d_obj_val, + Vertices& d_vertices_dev, + int SP, + int N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValDual<<>>( + kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); CHECK_CUDA(handle.get_stream()); @@ -466,20 +534,21 @@ inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val, - weight_t const *d_costs, - vertex_t const *d_row_assignments, int SP, - vertex_t N) { +inline void calcObjValPrimal(raft::handle_t const& handle, + weight_t* d_obj_val, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValPrimal<<>>(d_obj_val, d_costs, - d_row_assignments, SP, N); + kernel_calcObjValPrimal<<>>( + d_obj_val, d_costs, d_row_assignments, SP, N); CHECK_CUDA(handle.get_stream()); } diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 8c9012ed72..45ad23afd1 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -48,42 +48,57 @@ const int AUGMENT{4}; const int MODIFIED{5}; template -bool __device__ near_zero(weight_t w, weight_t epsilon) { +bool __device__ near_zero(weight_t w, weight_t epsilon) +{ return ((w > -epsilon) && (w < epsilon)); } template <> -bool __device__ near_zero(int32_t w, int32_t epsilon) { +bool __device__ near_zero(int32_t w, int32_t epsilon) +{ return (w == 0); } template <> -bool __device__ near_zero(int64_t w, int64_t epsilon) { +bool __device__ near_zero(int64_t w, int64_t epsilon) +{ return (w == 0); } -// Device function for traversing the neighbors from start pointer to end pointer and updating the covers. -// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution. +// Device function for traversing the neighbors from start pointer to end pointer and updating the +// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of +// Step 4 execution. template -__device__ void cover_and_expand_row( - weight_t const *d_elements, weight_t const *d_row_duals, - weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers, - int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag, - vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited, - int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N, - weight_t epsilon) { +__device__ void cover_and_expand_row(weight_t const* d_elements, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + weight_t* d_col_slacks, + int* d_row_covers, + int* d_col_covers, + vertex_t const* d_col_assignments, + bool* d_flag, + vertex_t* d_row_parents, + vertex_t* d_col_parents, + int* d_row_visited, + int* d_col_visited, + vertex_t rowid, + int spid, + int colid, + vertex_t N, + weight_t epsilon) +{ int ROWID = spid * N + rowid; int COLID = spid * N + colid; - weight_t slack = d_elements[spid * N * N + rowid * N + colid] - - d_row_duals[ROWID] - d_col_duals[COLID]; + weight_t slack = + d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID]; int nxt_rowid = d_col_assignments[COLID]; int NXT_ROWID = spid * N + nxt_rowid; if (rowid != nxt_rowid && d_col_covers[COLID] == 0) { if (slack < d_col_slacks[COLID]) { - d_col_slacks[COLID] = slack; + d_col_slacks[COLID] = slack; d_col_parents[COLID] = ROWID; } @@ -92,13 +107,12 @@ __device__ void cover_and_expand_row( d_row_parents[NXT_ROWID] = COLID; // update parent info d_row_covers[NXT_ROWID] = 0; - d_col_covers[COLID] = 1; + d_col_covers[COLID] = 1; - if (d_row_visited[NXT_ROWID] != VISITED) - d_row_visited[NXT_ROWID] = ACTIVE; + if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE; } else { d_col_visited[COLID] = REVERSE; - *d_flag = true; + *d_flag = true; } } } @@ -107,28 +121,34 @@ __device__ void cover_and_expand_row( // Device function for traversing an alternating path from unassigned row to unassigned column. template -__device__ void __reverse_traversal( - int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children, - vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) { +__device__ void __reverse_traversal(int* d_row_visited, + vertex_t* d_row_children, + vertex_t* d_col_children, + vertex_t const* d_row_parents, + vertex_t const* d_col_parents, + int cur_colid) +{ int cur_rowid = -1; while (cur_colid != -1) { d_col_children[cur_colid] = cur_rowid; - cur_rowid = d_col_parents[cur_colid]; + cur_rowid = d_col_parents[cur_colid]; d_row_children[cur_rowid] = cur_colid; - cur_colid = d_row_parents[cur_rowid]; + cur_colid = d_row_parents[cur_rowid]; } d_row_visited[cur_rowid] = AUGMENT; } // Device function for augmenting the alternating path from unassigned column to unassigned row. template -__device__ void __augment(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_children, - vertex_t const *d_col_children, vertex_t cur_rowid, - vertex_t N) { +__device__ void __augment(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_children, + vertex_t const* d_col_children, + vertex_t cur_rowid, + vertex_t N) +{ int cur_colid = -1; while (cur_rowid != -1) { @@ -145,20 +165,18 @@ __device__ void __augment(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_rowReduction(weight_t const *d_costs, - weight_t *d_row_duals, int SP, vertex_t N, - weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; - int rowid = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void kernel_rowReduction( + weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; + int rowid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && rowid < N) { for (int colid = 0; colid < N; colid++) { weight_t slack = d_costs[spid * N * N + rowid * N + colid]; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_row_duals[spid * N + rowid] = min; @@ -169,25 +187,26 @@ __global__ void kernel_rowReduction(weight_t const *d_costs, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_columnReduction(weight_t const *d_costs, - weight_t const *d_row_duals, - weight_t *d_col_duals, int SP, - vertex_t N, weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_columnReduction(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t* d_col_duals, + int SP, + vertex_t N, + weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && colid < N) { for (int rowid = 0; rowid < N; rowid++) { - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[spid * N + rowid]; weight_t slack = cost - row_dual; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_col_duals[spid * N + colid] = min; @@ -196,12 +215,18 @@ __global__ void kernel_columnReduction(weight_t const *d_costs, // Kernel for calculating initial assignments. template -__global__ void kernel_computeInitialAssignments( - weight_t const *d_costs, weight_t const *d_row_duals, - weight_t const *d_col_duals, vertex_t *d_row_assignments, - vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeInitialAssignments(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + int* d_row_lock, + int* d_col_lock, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && colid < N) { @@ -213,15 +238,15 @@ __global__ void kernel_computeInitialAssignments( if (d_col_lock[overall_colid] == 1) break; - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[overall_rowid]; - weight_t slack = cost - row_dual - col_dual; + weight_t slack = cost - row_dual - col_dual; if (near_zero(slack, epsilon)) { if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) { d_row_assignments[overall_rowid] = colid; d_col_assignments[overall_colid] = rowid; - d_col_lock[overall_colid] = 1; + d_col_lock[overall_colid] = 1; } } } @@ -230,10 +255,10 @@ __global__ void kernel_computeInitialAssignments( // Kernel for populating the cover arrays and initializing alternating tree. template -__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, - int *d_row_covers, int *d_row_visited, - int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeRowCovers( + vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -249,11 +274,10 @@ __global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, // Kernel for populating the predicate matrix for edges in row major format. template -__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, - vertex_t *d_addresses, - int *d_row_visited, int SP, - vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowPredicateConstructionCSR( + bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -261,130 +285,160 @@ __global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, if (d_row_visited[index] == ACTIVE) { d_predicates[index] = true; - d_addresses[index] = 1; + d_addresses[index] = 1; } else { d_predicates[index] = false; - d_addresses[index] = 0; + d_addresses[index] = 0; } } } // Kernel for scattering the edges based on the scatter addresses. template -__global__ void kernel_rowScatterCSR(bool const *d_predicates, - vertex_t const *d_addresses, - vertex_t *d_neighbors, vertex_t *d_ptrs, - vertex_t M, int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowScatterCSR(bool const* d_predicates, + vertex_t const* d_addresses, + vertex_t* d_neighbors, + vertex_t* d_ptrs, + vertex_t M, + int SP, + vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { int index = spid * N + rowid; - bool predicate = d_predicates[index]; + bool predicate = d_predicates[index]; vertex_t compid = d_addresses[index]; - if (predicate) { - d_neighbors[compid] = rowid; - } + if (predicate) { d_neighbors[compid] = rowid; } if (rowid == 0) { d_ptrs[spid] = compid; - d_ptrs[SP] = M; + d_ptrs[SP] = M; } } } // Kernel for finding the minimum zero cover. template -__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs, - vertex_t const *d_neighbors, - weight_t const *d_elements, +__global__ void kernel_coverAndExpand(bool* d_flag, + vertex_t const* d_ptrs, + vertex_t const* d_neighbors, + weight_t const* d_elements, Vertices d_vertices, VertexData d_row_data, - VertexData d_col_data, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; + VertexData d_col_data, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; // Load values into local memory if (spid < SP && colid < N) { thrust::for_each( - thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1], - [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, - epsilon] __device__(vertex_t rowid) { - cover_and_expand_row( - d_elements, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers, - d_vertices.col_assignments, d_flag, d_row_data.parents, - d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited, - rowid, spid, colid, N, epsilon); + thrust::seq, + d_neighbors + d_ptrs[spid], + d_neighbors + d_ptrs[spid + 1], + [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__( + vertex_t rowid) { + cover_and_expand_row(d_elements, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.col_slacks, + d_vertices.row_covers, + d_vertices.col_covers, + d_vertices.col_assignments, + d_flag, + d_row_data.parents, + d_col_data.parents, + d_row_data.is_visited, + d_col_data.is_visited, + rowid, + spid, + colid, + N, + epsilon); }); } } // Kernel for constructing the predicates for reverse pass or augmentation candidates. template -__global__ void kernel_augmentPredicateConstruction(bool *d_predicates, - vertex_t *d_addresses, - int *d_visited, int size) { +__global__ void kernel_augmentPredicateConstruction(bool* d_predicates, + vertex_t* d_addresses, + int* d_visited, + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { int visited = d_visited[id]; if ((visited == REVERSE) || (visited == AUGMENT)) { d_predicates[id] = true; - d_addresses[id] = 1; + d_addresses[id] = 1; } else { d_predicates[id] = false; - d_addresses[id] = 0; + d_addresses[id] = 0; } } } // Kernel for scattering the vertices based on the scatter addresses. template -__global__ void kernel_augmentScatter(vertex_t *d_elements, - bool const *d_predicates, - vertex_t const *d_addresses, - std::size_t size) { +__global__ void kernel_augmentScatter(vertex_t* d_elements, + bool const* d_predicates, + vertex_t const* d_addresses, + std::size_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - if (d_predicates[id]) { - d_elements[d_addresses[id]] = id; - } + if (d_predicates[id]) { d_elements[d_addresses[id]] = id; } } } // Kernel for executing the reverse pass of the maximum matching algorithm. template -__global__ void kernel_reverseTraversal(vertex_t *d_elements, +__global__ void kernel_reverseTraversal(vertex_t* d_elements, VertexData d_row_data, VertexData d_col_data, - int size) { + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __reverse_traversal(d_row_data.is_visited, d_row_data.children, - d_col_data.children, d_row_data.parents, - d_col_data.parents, d_elements[id]); + __reverse_traversal(d_row_data.is_visited, + d_row_data.children, + d_col_data.children, + d_row_data.parents, + d_col_data.parents, + d_elements[id]); } } // Kernel for executing the augmentation pass of the maximum matching algorithm. template -__global__ void kernel_augmentation(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_elements, +__global__ void kernel_augmentation(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_elements, VertexData d_row_data, - VertexData d_col_data, vertex_t N, - vertex_t size) { + VertexData d_col_data, + vertex_t N, + vertex_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __augment(d_row_assignments, d_col_assignments, d_row_data.children, - d_col_data.children, d_row_elements[id], N); + __augment(d_row_assignments, + d_col_assignments, + d_row_data.children, + d_col_data.children, + d_row_elements[id], + N); } } @@ -392,18 +446,21 @@ __global__ void kernel_augmentation(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_1(weight_t *d_sp_min, - weight_t const *d_col_slacks, - int const *d_col_covers, int SP, vertex_t N, - weight_t infinity) { +__global__ void kernel_dualUpdate_1(weight_t* d_sp_min, + weight_t const* d_col_slacks, + int const* d_col_covers, + int SP, + vertex_t N, + weight_t infinity) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { weight_t min = infinity; for (int colid = 0; colid < N; colid++) { - int index = spid * N + colid; + int index = spid * N + colid; weight_t slack = d_col_slacks[index]; - int col_cover = d_col_covers[index]; + int col_cover = d_col_covers[index]; if (col_cover == 0) if (slack < min) min = slack; @@ -417,21 +474,29 @@ __global__ void kernel_dualUpdate_1(weight_t *d_sp_min, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_2( - weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals, - weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers, - int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N, - weight_t infinity, weight_t epsilon) { +__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min, + weight_t* d_row_duals, + weight_t* d_col_duals, + weight_t* d_col_slacks, + int const* d_row_covers, + int const* d_col_covers, + int* d_row_visited, + vertex_t* d_col_parents, + int SP, + vertex_t N, + weight_t infinity, + weight_t epsilon) +{ int spid = blockIdx.y * blockDim.y + threadIdx.y; - int id = blockIdx.x * blockDim.x + threadIdx.x; + int id = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && id < N) { int index = spid * N + id; if (d_sp_min[spid] < infinity) { weight_t theta = d_sp_min[spid]; - int row_cover = d_row_covers[index]; - int col_cover = d_col_covers[index]; + int row_cover = d_row_covers[index]; + int col_cover = d_col_covers[index]; if (row_cover == 0) // Row vertex is reachable from source. d_row_duals[index] += theta; @@ -453,10 +518,12 @@ __global__ void kernel_dualUpdate_2( // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, - weight_t const *d_row_duals, - weight_t const *d_col_duals, int SP, - vertex_t N) { +__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { @@ -471,10 +538,12 @@ __global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal, - weight_t const *d_costs, - vertex_t const *d_row_assignments, - int SP, vertex_t N) { +__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 7a454f64e2..11d3174951 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -37,8 +37,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; unaryOp(out, in, len, op, stream); } @@ -57,23 +57,24 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void add(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { +__global__ void add_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] + *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -83,14 +84,16 @@ __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @param stream cuda stream */ template -void addDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // TODO: block dimension has not been tuned dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 940d786e87..a49a433941 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -22,10 +22,10 @@ namespace raft { namespace linalg { -template -__global__ void binaryOpKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len, Lambda op) { +template +__global__ void binaryOpKernel( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a, b; @@ -42,12 +42,11 @@ __global__ void binaryOpKernel(OutType *out, const InType *in1, c.store(out, idx); } -template -void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, - IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void binaryOpImpl( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); binaryOpKernel <<>>(out, in1, in2, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -56,8 +55,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, /** * @brief Checks if addresses are aligned on N bytes */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, - uint64_t N) { +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) +{ return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } @@ -77,38 +76,36 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val1, const InType& val2);` */ -template -void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, - Lambda op, cudaStream_t stream) { - constexpr auto maxSize = - sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 16)) { +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 8)) { + } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 4)) { + } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 2)) { + } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { binaryOpImpl( out, in1, in2, len, op, stream); } else if (1 / maxSize) { binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, - op, stream); + binaryOpImpl(out, in1, in2, len, op, stream); } } diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index b5a93c4953..b129fe4758 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -122,9 +122,16 @@ namespace linalg { * conditioned systems. Negative values mean no regularizaton. */ template -void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, - void *workspace, int *n_bytes, cublasFillMode_t uplo, - cudaStream_t stream, math_t eps = -1) { +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ // The matrix A' is defined as: // A' = [[A_11, A_12] // [A_21, A_22]] @@ -144,18 +151,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // We need a workspace in device memory to store a scalar. Additionally, in // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. const int align = 256; - int offset = (uplo == CUBLAS_FILL_MODE_LOWER) - ? raft::alignTo(sizeof(math_t) * (n - 1), align) - : 0; + int offset = + (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; if (workspace == nullptr) { *n_bytes = offset + 1 * sizeof(math_t); return; } - math_t *s = reinterpret_cast(((char *)workspace) + offset); - math_t *L_22 = L + (n - 1) * ld + n - 1; + math_t* s = reinterpret_cast(((char*)workspace) + offset); + math_t* L_22 = L + (n - 1) * ld + n - 1; - math_t *A_new; - math_t *A_row; + math_t* A_new; + math_t* A_row; if (uplo == CUBLAS_FILL_MODE_UPPER) { // A_new is stored as the n-1 th column of L A_new = L + (n - 1) * ld; @@ -164,27 +170,36 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // as the n-th row of L. Since the matrix is column major, this is non // contiguous. We copy elements from A_row to a contiguous workspace A_new. A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_row, ld, A_new, 1, stream)); + A_new = reinterpret_cast(workspace); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); } - cublasOperation_t op = - (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm( - handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, - CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); + CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, - A_new, 1, A_new, 1, s, stream)); + CUBLAS_CHECK( + raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_new, 1, A_row, ld, stream)); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); } } else { // n == 1 case CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); @@ -202,9 +217,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // the system is very ill conditioned then the A_22 - L_12 * L_12 can be // negative, which would result L_22 = NaN. A small positive eps parameter // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { - L_22_host = eps; - } + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); raft::update_device(L_22, &L_22_host, 1, stream); } diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index ef983ff3d0..7e0744f98a 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -26,18 +26,27 @@ namespace linalg { // of the matrix, i.e. reduce along rows for row major or reduce along columns // for column major layout. Kernel does an inplace reduction adding to original // values of dots. -template -__global__ void coalescedReductionKernel(OutType *dots, const InType *data, - int D, int N, OutType init, +template +__global__ void coalescedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, MainLambda main_op, ReduceLambda reduce_op, FinalLambda final_op, - bool inplace = false) { + bool inplace = false) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; + IdxType rowStart = blockIdx.x * D; for (IdxType i = threadIdx.x; i < D; i += TPB) { IdxType idx = rowStart + i; thread_data = reduce_op(thread_data, main_op(data[idx], i)); @@ -79,33 +88,37 @@ __global__ void coalescedReductionKernel(OutType *dots, const InType *data, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType *dots, const InType *data, int D, int N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ // One block per reduction // Efficient only for large leading dimensions if (D <= 32) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 64) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 128) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index aa711a9140..35d9d96ea4 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -55,8 +55,7 @@ namespace linalg { * thread block. This also determines the number of threads per * thread block */ -template +template struct KernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -101,8 +100,7 @@ struct KernelPolicy { }; // struct KernelPolicy -template +template struct ColKernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -151,7 +149,8 @@ struct ColKernelPolicy { * @{ */ template -struct Policy4x4 {}; +struct Policy4x4 { +}; template struct Policy4x4 { @@ -180,8 +179,7 @@ struct Policy4x4 { * @tparam Policy policy used to customize memory access behavior. * See documentation for `KernelPolicy` to know more. */ -template +template struct Contractions_NT { protected: typedef Policy P; @@ -247,8 +245,7 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, char* _smem) + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem) : m(_m), n(_n), k(_k), @@ -265,7 +262,9 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) {} + pageRd(0) + { + } /** * @brief Ctor @@ -276,8 +275,15 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) + DI Contractions_NT(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + char* _smem) : m(_m), n(_n), k(_k), @@ -291,17 +297,18 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) { + pageRd(0) + { if (isRowMajor) { xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; + x = _x + xrowid * lda; + y = _y + yrowid * ldb; } else { xrowid = IdxT(blockIdx.y) * P::Mblk; yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; + x = _x + xrowid + srowid * lda; + y = _y + yrowid + srowid * ldb; } } @@ -310,7 +317,8 @@ struct Contractions_NT { * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ - DI void ldgXY(IdxT kidx) { + DI void ldgXY(IdxT kidx) + { ldgX(kidx); ldgY(kidx); } @@ -319,7 +327,8 @@ struct Contractions_NT { * @brief Store current block of X/Y from registers to smem * @param[in] kidx current start index of k to be loaded */ - DI void stsXY() { + DI void stsXY() + { stsX(sx + pageWr * P::SmemPage); stsY(sy + pageWr * P::SmemPage); } @@ -328,13 +337,15 @@ struct Contractions_NT { * @brief Load X and Y block from shared memory to registers * @param[in] kidx k value from the current k-block to be loaded from smem */ - DI void ldsXY(int kidx) { + DI void ldsXY(int kidx) + { ldsX(kidx, sx + pageRd * P::SmemPage); ldsY(kidx, sy + pageRd * P::SmemPage); } private: - DI void ldgX(IdxT kidx) { + DI void ldgX(IdxT kidx) + { if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -351,11 +362,10 @@ struct Contractions_NT { } } else { const auto numRows = k; - auto koffset = scolid; + auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && - (srowid + kidx + i * P::LdgRowsX) < numRows) { + if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) { ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); } else { #pragma unroll @@ -367,7 +377,8 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) { + DI void ldgY(IdxT kidx) + { if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; @@ -387,8 +398,7 @@ struct Contractions_NT { auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && - (srowid + kidx + i * P::LdgRowsY) < numRows) { + if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); } else { #pragma unroll @@ -400,7 +410,8 @@ struct Contractions_NT { } } - DI void stsX(DataT* smem) { + DI void stsX(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { @@ -408,7 +419,8 @@ struct Contractions_NT { } } - DI void stsY(DataT* smem) { + DI void stsY(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { @@ -416,7 +428,8 @@ struct Contractions_NT { } } - DI void ldsX(int kidx, DataT* smem) { + DI void ldsX(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + accrowid * P::SmemStride + kidx; #pragma unroll @@ -435,7 +448,8 @@ struct Contractions_NT { } } - DI void ldsY(int kidx, DataT* smem) { + DI void ldsY(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + acccolid * P::SmemStride + kidx; #pragma unroll diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 7c79e6c91d..2d18691410 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -25,8 +25,7 @@ #include #define _CUBLAS_ERR_TO_STR(err) \ - case err: \ - return #err + case err: return #err namespace raft { @@ -34,15 +33,15 @@ namespace raft { * @brief Exception thrown when a cuBLAS error is encountered. */ struct cublas_error : public raft::exception { - explicit cublas_error(char const *const message) : raft::exception(message) {} - explicit cublas_error(std::string const &message) - : raft::exception(message) {} + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cublas_error_to_string(cublasStatus_t err) { +inline const char* cublas_error_to_string(cublasStatus_t err) +{ switch (err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); @@ -54,8 +53,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: - return "CUBLAS_STATUS_UNKNOWN"; + default: return "CUBLAS_STATUS_UNKNOWN"; }; } @@ -71,16 +69,19 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { * Invokes a cuBLAS runtime API function call, if the call does not return * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred */ -#define CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ +#define CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ } while (0) /** FIXME: temporary alias for cuML compatibility */ @@ -107,22 +108,39 @@ namespace linalg { * @{ */ template -cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha, - const T *x, int incx, T *y, int incy, +cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const T* alpha, + const T* x, + int incx, + T* y, + int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const float *alpha, const float *x, int incx, - float *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const float* alpha, + const float* x, + int incx, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const double *alpha, const double *x, int incx, - double *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const double* alpha, + const double* x, + int incx, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } @@ -133,21 +151,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, * @{ */ template -cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y, - int incy, cudaStream_t stream); +cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -159,20 +177,20 @@ inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, * @{ */ template -cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx, - T *y, int incy, cudaStream_t stream); +cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } @@ -183,31 +201,56 @@ inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, * @{ */ template -cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA, - int m, int n, const T *alfa, const T *A, int lda, - const T *x, int incx, const T *beta, T *y, int incy, +cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* x, + int incx, + const T* beta, + T* y, + int incy, cudaStream_t stream); template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const float *alfa, const float *A, int lda, - const float *x, int incx, const float *beta, - float *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* x, + int incx, + const float* beta, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const double *alfa, const double *A, int lda, - const double *x, int incx, const double *beta, - double *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* x, + int incx, + const double* beta, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -216,23 +259,47 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha, - const T *x, int incx, const T *y, int incy, T *A, - int lda, cudaStream_t stream); +cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const T* alpha, + const T* x, + int incx, + const T* y, + int incy, + T* A, + int lda, + cudaStream_t stream); template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const float *alpha, const float *x, int incx, - const float *y, int incy, float *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const float* alpha, + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const double *alpha, const double *x, int incx, - const double *y, int incy, double *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const double* alpha, + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -243,34 +310,62 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, * @{ */ template -cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const T *alfa, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc, +cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + int k, + const T* alfa, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const float *alfa, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const float* alfa, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const double *alfa, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const double* alfa, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -281,38 +376,93 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, template cublasStatus_t cublasgemmBatched(cublasHandle_t handle, // NOLINT cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, - const T *const Aarray[], // NOLINT - int lda, const T *const Barray[], // NOLINT - int ldb, const T *beta, - T *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream); + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray[], // NOLINT + int lda, + const T* const Barray[], // NOLINT + int ldb, + const T* beta, + T* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream); template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, - const float *const Aarray[], // NOLINT - int lda, const float *const Barray[], // NOLINT - int ldb, const float *beta, float *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray[], // NOLINT + int lda, + const float* const Barray[], // NOLINT + int ldb, + const float* beta, + float* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasSgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, - const double *const Aarray[], // NOLINT - int lda, const double *const Barray[], // NOLINT - int ldb, const double *beta, double *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray[], // NOLINT + int lda, + const double* const Barray[], // NOLINT + int ldb, + const double* beta, + double* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasDgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } /** @} */ @@ -322,36 +472,110 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT */ template cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const T *alpha, const T *const Aarray, int lda, - int64_t strideA, const T *const Barray, int ldb, int64_t strideB, - const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount, + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray, + int lda, + int64_t strideA, + const T* const Barray, + int ldb, + int64_t strideB, + const T* beta, + T* Carray, + int ldc, + int64_t strideC, + int batchCount, cudaStream_t stream); template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, const float *const Aarray, int lda, - int64_t strideA, const float *const Barray, int ldb, int64_t strideB, - const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray, + int lda, + int64_t strideA, + const float* const Barray, + int ldb, + int64_t strideB, + const float* beta, + float* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasSgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, const double *const Aarray, int lda, - int64_t strideA, const double *const Barray, int ldb, int64_t strideB, - const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray, + int lda, + int64_t strideA, + const double* const Barray, + int ldb, + int64_t strideB, + const double* beta, + double* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasDgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } /** @} */ @@ -361,51 +585,85 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT */ template -cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n, // NOLINT - T *const A[], // NOLINT - int lda, int *P, int *info, int batchSize, +cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, + int n, // NOLINT + T* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, float *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + float* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, double *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + double* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } template -cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n, // NOLINT - const T *const A[], // NOLINT - int lda, const int *P, - T *const C[], // NOLINT - int ldc, int *info, int batchSize, +cublasStatus_t cublasgetriBatched(cublasHandle_t handle, + int n, // NOLINT + const T* const A[], // NOLINT + int lda, + const int* P, + T* const C[], // NOLINT + int ldc, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const float *const A[], // NOLINT - int lda, const int *P, float *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const float* const A[], // NOLINT + int lda, + const int* P, + float* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const double *const A[], // NOLINT - int lda, const int *P, double *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const double* const A[], // NOLINT + int lda, + const int* P, + double* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -419,34 +677,57 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT template inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, T *Aarray[], // NOLINT - int lda, T *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream); + cublasOperation_t trans, + int m, + int n, + int nrhs, + T* Aarray[], // NOLINT + int lda, + T* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream); template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, float *Aarray[], // NOLINT - int lda, float *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + float* Aarray[], // NOLINT + int lda, + float* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasSgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, double *Aarray[], // NOLINT - int lda, double *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + double* Aarray[], // NOLINT + int lda, + double* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasDgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } /** @} */ @@ -456,33 +737,59 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT * @{ */ template -cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, const T *alfa, - const T *A, int lda, const T *beta, const T *B, - int ldb, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* beta, + const T* B, + int ldb, + T* C, + int ldc, + cudaStream_t stream); template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const float *alfa, const float *A, int lda, - const float *beta, const float *B, int ldb, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* beta, + const float* B, + int ldb, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const double *alfa, const double *A, int lda, - const double *beta, const double *B, int ldb, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* beta, + const double* B, + int ldb, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -491,31 +798,59 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, const T *alpha, - const T *A, int lda, const T *B, int ldb, - const T *beta, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const T* alpha, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const float *alpha, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const double *alpha, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -524,27 +859,51 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, const T *alpha, - const T *A, int lda, const T *beta, T *C, int ldc, +cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const T* alpha, + const T* A, + int lda, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const float *alpha, const float *A, int lda, - const float *beta, float *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const double *alpha, const double *A, int lda, - const double *beta, double *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -555,52 +914,77 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, * @{ */ template -cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx, - T *result, cudaStream_t stream); +cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream); template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x, - int incx, float *result, cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x, - int incx, double *result, - cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ template -cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, const T *alpha, - const T *A, int lda, T *B, int ldb, +cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const T* alpha, + const T* A, + int lda, + T* B, + int ldb, cudaStream_t stream); template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const float *alpha, const float *A, int lda, - float *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, + const float* A, + int lda, + float* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const double *alpha, const double *A, int lda, - double *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, + const double* A, + int lda, + double* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } /** @@ -608,21 +992,39 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx, - const T *y, int incy, T *result, cudaStream_t stream); +cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const T* x, + int incx, + const T* y, + int incy, + T* result, + cudaStream_t stream); template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x, - int incx, const float *y, int incy, - float *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const float* x, + int incx, + const float* y, + int incy, + float* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, - int incx, const double *y, int incy, - double *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const double* x, + int incx, + const double* y, + int incy, + double* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } @@ -642,7 +1044,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, // template<> inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } @@ -653,21 +1056,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x, - int incx, cudaStream_t stream); +cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream); template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const float *alpha, float *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const double *alpha, double *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index 0eadf47fe3..76a9f40f4d 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -24,8 +24,7 @@ #include #define _CUSOLVER_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; namespace raft { @@ -33,16 +32,15 @@ namespace raft { * @brief Exception thrown when a cuSOLVER error is encountered. */ struct cusolver_error : public raft::exception { - explicit cusolver_error(char const *const message) - : raft::exception(message) {} - explicit cusolver_error(std::string const &message) - : raft::exception(message) {} + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cusolver_error_to_string(cusolverStatus_t err) { +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ switch (err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); @@ -54,8 +52,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: - return "CUSOLVER_STATUS_UNKNOWN"; + default: return "CUSOLVER_STATUS_UNKNOWN"; }; } @@ -76,8 +73,11 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { cusolverStatus_t const status = (call); \ if (CUSOLVER_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::linalg::detail::cusolver_error_to_string(status)); \ throw raft::cusolver_error(msg); \ } \ @@ -107,42 +107,76 @@ namespace linalg { * @{ */ template -cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *Workspace, - int *devIpiv, int *devInfo, +cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* Workspace, + int* devIpiv, + int* devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -152,30 +186,49 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, int nrhs, - const T *A, int lda, const int *devIpiv, T *B, - int ldb, int *devInfo, cudaStream_t stream); + cublasOperation_t trans, + int n, + int nrhs, + const T* A, + int lda, + const int* devIpiv, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const float *A, int lda, - const int *devIpiv, float *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const float* A, + int lda, + const int* devIpiv, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const double *A, int lda, - const int *devIpiv, double *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const double* A, + int lda, + const int* devIpiv, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -185,20 +238,40 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork) +{ return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork) +{ return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } /** @} */ @@ -209,52 +282,96 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *info, syevjInfo_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* info, + syevjInfo_t params, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, float *A, int lda, float *W, float *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, double *A, int lda, double *W, double *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork, + syevjInfo_t params); template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork, - syevjInfo_t params) { - return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork, - syevjInfo_t params) { - return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } /** @} */ @@ -264,32 +381,49 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *devInfo, cudaStream_t stream); + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, float *A, - int lda, float *W, float *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, double *A, - int lda, double *W, double *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ @@ -297,57 +431,134 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT /** * @defgroup syevdx cusolver syevdx operations * @{ -*/ + */ template cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, - int il, int iu, int *h_meig, const float *W, int *lwork) { - return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + const float* W, + int* lwork) +{ + return cusolverDnSsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, const double *W, int *lwork) { - return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + const double* W, + int* lwork) +{ + return cusolverDnDsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, - int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnSsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnDsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } /** @} */ #endif @@ -358,7 +569,11 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT */ template cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int* lwork) +{ if (std::is_same, float>::value) { return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); } else { @@ -367,72 +582,194 @@ cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT } template cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork, - T *rwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* VT, + int ldvt, + T* work, + int lwork, + T* rwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, - float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* VT, + int ldvt, + float* work, + int lwork, + float* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnSgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, - double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* VT, + int ldvt, + double* work, + int lwork, + double* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnDgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv, - int *lwork, gesvdjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const T* A, + int lda, + const T* S, + const T* U, + int ldu, + const T* V, + int ldv, + int* lwork, + gesvdjInfo_t params); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const float *A, int lda, const float *S, const float *U, int ldu, - const float *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const float* A, + int lda, + const float* S, + const float* U, + int ldu, + const float* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnSgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const double *A, int lda, const double *S, const double *U, int ldu, - const double *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const double* A, + int lda, + const double* S, + const double* U, + int ldu, + const double* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnDgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork, - int *info, gesvdjInfo_t params, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* V, + int ldv, + T* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, - float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* V, + int ldv, + float* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnSgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, - double *work, int lwork, int *info, gesvdjInfo_t params, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* V, + int ldv, + double* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnDgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } /** @} */ @@ -442,43 +779,74 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT */ template cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda, - int *Lwork); + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, T *A, - int lda, T *Workspace, int Lwork, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, float *A, - int lda, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, double *A, - int lda, double *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -490,26 +858,44 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const T *A, int lda, T *B, int ldb, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + int nrhs, + const T* A, + int lda, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const float *A, int lda, float *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const float* A, + int lda, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const double *A, int lda, double *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const double* A, + int lda, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -520,38 +906,75 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT * @{ */ template -cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *TAU, T *Workspace, - int Lwork, int *devInfo, cudaStream_t stream); +cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* TAU, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *TAU, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* TAU, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *TAU, double *Workspace, - int Lwork, int *devInfo, - cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* TAU, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } /** @} */ @@ -562,38 +985,86 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau, - T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + int m, + int n, + int k, + T* A, + int lda, + const T* tau, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda, - const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + float* A, + int lda, + const float* tau, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda, - const double *tau, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + double* A, + int lda, + const double* tau, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda, - const T *TAU, int *lwork); + cusolverDnHandle_t handle, + int m, + int n, + int k, + const T* A, + int lda, + const T* TAU, + int* lwork); template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, - const float *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const float* A, + int lda, + const float* TAU, + int* lwork) +{ return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, - const double *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const double* A, + int lda, + const double* TAU, + int* lwork) +{ return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } /** @} */ @@ -604,53 +1075,114 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle, // NOLINT - cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, - const T *tau, T *C, int ldc, T *work, - int lwork, int *devInfo, cudaStream_t stream); + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + T* C, + int ldc, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, float *C, - int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + float* C, + int ldc, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, double *C, - int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + double* C, + int ldc, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc, - int *lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + const T* C, + int ldc, + int* lwork); template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, - const float *C, int ldc, int *lwork) { - return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + const float* C, + int ldc, + int* lwork) +{ + return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, - const double *C, int ldc, int *lwork) { - return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + const double* C, + int ldc, + int* lwork) +{ + return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } /** @} */ @@ -660,62 +1192,136 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes); template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpScsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpScsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpDcsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpDcsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info, - void *pBuffer, cudaStream_t stream); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* b, + T* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const float *b, float *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* b, + float* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpScsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const double *b, double *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* b, + double* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpDcsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } /** @} */ diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index c848ac1f4b..562a3d8991 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 6172618380..75e77ac0ce 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -41,26 +41,43 @@ namespace linalg { * @{ */ template -void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, - int n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void eigDC(const raft::handle_t& handle, + const math_t* in, + int n_rows, + int n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; - CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, in, - n_cols, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + eig_vals, + &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer d_dev_info(allocator, stream, 1); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); CUDA_CHECK(cudaGetLastError()); int dev_info; @@ -90,39 +107,80 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; * @{ */ template -void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, - EigVecMemUsage memUsage, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void eigSelDC(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + int n_eig_vals, + math_t* eig_vectors, + math_t* eig_vals, + EigVecMemUsage memUsage, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; int h_meig; - CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer d_dev_info(allocator, stream, 1); raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); if (memUsage == OVERWRITE_INPUT) { - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } else if (memUsage == COPY_INPUT) { d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), - math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, - d_work.data(), lwork, d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } CUDA_CHECK(cudaGetLastError()); @@ -135,11 +193,10 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, "This usually occurs when some of the features do not vary enough."); if (memUsage == OVERWRITE_INPUT) { - raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, - stream); + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream); } else if (memUsage == COPY_INPUT) { - raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, - n_rows, n_eig_vals, stream); + raft::matrix::truncZeroOrigin( + d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream); } } @@ -160,10 +217,17 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @{ */ template -void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, - int n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - auto allocator = handle.get_device_allocator(); +void eigJacobi(const raft::handle_t& handle, + const math_t* in, + int n_rows, + int n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream, + math_t tol = 1.e-7, + int sweeps = 15) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; @@ -172,23 +236,36 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); int lwork; - CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, - eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + &lwork, + syevj_params)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer dev_info(allocator, stream, 1); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - dev_info.data(), syevj_params, stream)); + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + dev_info.data(), + syevj_params, + stream)); int executed_sweeps; - CUSOLVER_CHECK( - cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); CUDA_CHECK(cudaGetLastError()); CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh index 1c6dee562d..097c3ac218 100644 --- a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -34,19 +34,17 @@ namespace linalg { * @{ */ template -void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream); } template -void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream); } /** @} */ @@ -62,42 +60,46 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, * @{ */ template -void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream); } template -void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream); } template -void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream); } template -void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, + out, + in1, + in2, + len, [] __device__(InType a, InType b) { if (b == InType(0.0)) return InType(0.0); diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 0a4897cc0b..d5942b7446 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -43,35 +43,53 @@ namespace linalg { * @param stream cuda stream */ template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, - math_t beta, cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - int m = n_rows_c; - int n = n_cols_c; - int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + int m = n_rows_c; + int n = n_cols_c; + int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; int lda = trans_a == CUBLAS_OP_T ? k : m; int ldb = trans_b == CUBLAS_OP_T ? n : k; int ldc = m; - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, - cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, - trans_b, alpha, beta, stream); + math_t beta = math_t(0); + gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } /** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * @brief A wrapper for CUBLS GEMM function designed for handling all possible * combinations of operand layouts. * It computes the following equation: Z = alpha . X * Y + beta . Z * @tparam T Data type of input/output matrices (float/double) @@ -90,9 +108,20 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, * @param beta scalar */ template -void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, - int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, - cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { +void gemm(const raft::handle_t& handle, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + T alpha = T(1.0), + T beta = T(0.0)) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t trans_a, trans_b; @@ -119,13 +148,13 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major // layout, trans_b needs to be CUBLAS_OP_N. trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - ldb = isYColMajor == true ? _K : _N; + ldb = isYColMajor == true ? _K : _N; - c = z; + c = z; ldc = _M; - M = _M; - N = _N; - K = _K; + M = _M; + N = _N; + K = _K; } else { // Result c is required in row major layout Thus we pick // a = y, b = x and c = a * b = y * x @@ -154,7 +183,7 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // Set leading dimension appropriately ldb = isXColMajor == true ? _M : _K; - c = z; + c = z; ldc = _N; M = _N; @@ -162,8 +191,8 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, K = _K; } // Actual cuBLAS call - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } } // end namespace linalg diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index edd18b3bee..a78480bb21 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -26,9 +26,19 @@ namespace raft { namespace linalg { template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, - const math_t* x, int incx, math_t* y, int incy, bool trans_a, - math_t alpha, math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* a, + int n_rows, + int n_cols, + const math_t* x, + int incx, + math_t* y, + int incy, + bool trans_a, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -40,33 +50,47 @@ void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, // n - number of columns in input matrix // lda - purpose of it to have ability to operate on submatrices of matrix without copying. // If you're not think about it it's always should be equal to m - // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform transpose + // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform + // transpose // In Machine Learning: // m - nunmber of columns in design matrix(number of features) // n - number of rows in designed matrix (number of train examples) - int m = n_rows; - int n = n_cols; + int m = n_rows; + int n = n_cols; int lda = trans_a ? m : n; - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, - y, incy, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, y, incy, stream)); } template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha, - math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* x, + math_t* y, + bool trans_a, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, - cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* x, + math_t* y, + bool trans_a, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); + math_t beta = math_t(0); gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index cb2e8ed1ab..2086172f5d 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -36,7 +36,8 @@ namespace { * \param [in] stream cuda stream */ template -void range(T *out, int start, int end, cudaStream_t stream) { +void range(T* out, int start, int end, cudaStream_t stream) +{ thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); @@ -53,7 +54,8 @@ void range(T *out, int start, int end, cudaStream_t stream) { * \param [in] stream cuda stream */ template -void range(T *out, int n, cudaStream_t stream) { +void range(T* out, int n, cudaStream_t stream) +{ range(out, 0, n, stream); } } // unnamed namespace diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index b775a1f696..39089473e3 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -16,7 +16,7 @@ #pragma once -//for cmath: +// for cmath: #define _USE_MATH_DEFINES #include @@ -40,14 +40,14 @@ using namespace linalg; namespace spectral { // curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - float *outputPtr, size_t n, - float mean, float stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ return curandGenerateNormal(generator, outputPtr, n, mean, stddev); } -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - double *outputPtr, size_t n, - double mean, double stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); } @@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, // Helper functions // ========================================================= -/** +/** * @brief Perform Lanczos iteration * Lanczos iteration is performed on a shifted matrix A+shift*I. * @tparam index_type_t the type of data used for indexing. @@ -85,25 +85,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, * @return Zero if successful. Otherwise non-zero. */ template -int performLanczosIteration( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t *iter, index_type_t maxIter, value_type_t shift, - value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev) { +int performLanczosIteration(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); @@ -117,29 +122,28 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, - stream)); + CUBLAS_CHECK(cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, - beta_host, stream)); + CUBLAS_CHECK(cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), - 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -151,65 +155,121 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, - lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), - sizeof(value_type_t), cudaMemcpyDeviceToHost, + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, n, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, - alpha_host + (*iter - 1), stream)); + CUBLAS_CHECK(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, - beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaStreamSynchronize(stream)); @@ -217,7 +277,7 @@ int performLanczosIteration( return 0; } -/** +/** * @brief Find Householder transform for 3-dimensional system * Given an input vector v=[x,y,z]', this function finds a * Householder transform P such that P*v is a multiple of @@ -235,8 +295,8 @@ int performLanczosIteration( * matrix. Matrix dimensions are 3 x 3. */ template -static void findHouseholder3(value_type_t *v, value_type_t *Pv, - value_type_t *P) { +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -246,8 +306,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, v[0] -= *Pv; // Normalize Householder vector - value_type_t normHouseholder = - std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -261,11 +320,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, // Construct Householder matrix index_type_t i, j; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; } -/** +/** * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The @@ -277,7 +338,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ template -static void applyHouseholder3(const value_type_t *v, value_type_t *A) { +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ // Loop indices index_type_t i, j; // Dot product between Householder vector and matrix row/column @@ -286,19 +348,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Pre-apply Householder transform for (j = 0; j < 4; ++j) { vDotA = 0; - for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; } // Post-apply Householder transform for (i = 0; i < 4; ++i) { vDotA = 0; - for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; } } -/** +/** * @brief Perform one step of Francis QR algorithm * Equivalent to two steps of the classical QR algorithm on a * tridiagonal matrix. @@ -319,10 +385,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { * @return Zero if successful. Otherwise non-zero. */ template -static int francisQRIteration(index_type_t n, value_type_t shift1, - value_type_t shift2, value_type_t *alpha, - value_type_t *beta, value_type_t *V, - value_type_t *work) { +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -352,30 +422,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, - householderMatrix); + findHouseholder3(householder, &temp, householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, - 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = beta[pos + 3]; @@ -385,22 +455,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, - householderMatrix); + findHouseholder3(householder, beta + pos, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), - n, householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = 0; @@ -408,37 +478,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, - householderMatrix); + findHouseholder3(householder, beta + n - 4, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, - householderMatrix); + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } -/** +/** * @brief Perform implicit restart of Lanczos algorithm * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. * @tparam index_type_t the type of data used for indexing. @@ -474,23 +543,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * @return error flag. */ template -static int lanczosRestart( - handle_t const &handle, index_type_t n, index_type_t iter, - index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, bool smallest_eig) { +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants constexpr value_type_t zero = 0; - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Loop index index_type_t i; @@ -501,12 +577,12 @@ static int lanczosRestart( index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - value_type_t *ritzVals_host = work_host + 3 * iter; + value_type_t* ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - value_type_t *shifts_host; + value_type_t* shifts_host; // Orthonormal matrix for similarity transform - value_type_t *V_dev = work_dev + n * iter; + value_type_t* V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation @@ -524,7 +600,8 @@ static int lanczosRestart( // Initialize similarity transform with identity matrix memset(V_host, 0, iter * iter * sizeof(value_type_t)); - for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1; + for (i = 0; i < iter; ++i) + V_host[IDX(i, i, iter)] = 1; // Determine interval to suppress eigenvalues if (smallest_eig) { @@ -548,49 +625,71 @@ static int lanczosRestart( // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = - cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, - beta_host, V_host, work_host)) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = - beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, - n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), 1, stream)); + CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, - &one, lanczosVecs_dev, n, V_dev, iter, &zero, - work_dev, n, stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, - beta_host + iter_new - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, - lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -601,7 +700,7 @@ static int lanczosRestart( // Eigensolver // ========================================================= -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -651,19 +750,28 @@ static int lanczosRestart( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *shift, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -683,21 +791,20 @@ int computeSmallestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -710,12 +817,11 @@ int computeSmallestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -738,10 +844,18 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -756,9 +870,17 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -775,9 +897,19 @@ int computeSmallestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -786,9 +918,17 @@ int computeSmallestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -799,39 +939,59 @@ int computeSmallestEigenvectors( } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); - CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -869,20 +1029,25 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ using namespace spectral; // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -892,8 +1057,8 @@ int computeSmallestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -901,10 +1066,23 @@ int computeSmallestEigenvectors( // Perform Lanczos method index_type_t effIter; value_type_t shift; - int status = computeSmallestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; @@ -914,7 +1092,7 @@ int computeSmallestEigenvectors( // Eigensolver // ========================================================= -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -959,19 +1137,27 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -987,8 +1173,8 @@ int computeLargestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled @@ -998,15 +1184,14 @@ int computeLargestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -1019,12 +1204,11 @@ int computeLargestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1044,13 +1228,21 @@ int computeLargestEigenvectors( CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; value_type_t shift_val = 0.0; - value_type_t *shift = &shift_val; - - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -1067,9 +1259,19 @@ int computeLargestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -1078,9 +1280,17 @@ int computeLargestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -1090,15 +1300,18 @@ int computeLargestEigenvectors( WARNING("implicitly restarted Lanczos failed to converge"); } for (int i = 0; i < restartIter; ++i) { - for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -1123,36 +1336,52 @@ int computeLargestEigenvectors( //} // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpyAsync( - eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -1190,18 +1419,23 @@ int computeLargestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1211,18 +1445,30 @@ int computeLargestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method index_type_t effIter; - int status = computeLargestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index aff08da2d3..200818fdc3 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -24,21 +24,18 @@ namespace raft { namespace linalg { -template -__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, - Args... args) { +template +__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +{ auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - out[idx] = map(in[idx], args[idx]...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -template -void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +template +void mapImpl( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); @@ -60,12 +57,14 @@ void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void map(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { - mapImpl(out, len, map, stream, in, - args...); +void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + mapImpl(out, len, map, stream, in, args...); } } // namespace linalg diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh index f2f198670a..78a7017c5c 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -24,50 +24,66 @@ namespace raft { namespace linalg { -struct sum_tag {}; +struct sum_tag { +}; template -__device__ void reduce(OutType *out, const InType acc, sum_tag) { +__device__ void reduce(OutType* out, const InType acc, sum_tag) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Sum(acc); - if (threadIdx.x == 0) { - raft::myAtomicAdd(out, tmp); - } + if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); } } template -__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) { +__device__ void reduce(OutType* out, const InType acc, ReduceLambda op) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); - if (threadIdx.x == 0) { - raft::myAtomicReduce(out, tmp, op); - } + if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); } } -template -__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral, - MapOp map, ReduceLambda op, - const InType *in, Args... args) { +template +__global__ void mapThenReduceKernel(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) +{ OutType acc = neutral; - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - acc = map(in[idx], args[idx]...); - } + if (idx < len) { acc = map(in[idx], args[idx]...); } __syncthreads(); reduce(out, acc, op); } -template -void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduceImpl(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ raft::update_device(out, &neutral, 1, stream); const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel @@ -89,10 +105,14 @@ void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, * @param args additional input arrays */ -template -void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ mapThenReduceImpl( out, len, (OutType)0, map, sum_tag(), stream, in, args...); } @@ -115,11 +135,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ mapThenReduceImpl( out, len, neutral, map, op, stream, in, args...); } diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 902816418f..98b5eaa809 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -23,10 +23,15 @@ namespace raft { namespace linalg { template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = threadIdx.x; @@ -57,17 +62,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType len = N * D; - IdxType nblks = - raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType len = N * D; + IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -89,11 +98,18 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, - cudaStream_t stream) { +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); + size_t bytes = stride * sizeof(Type); if (16 / sizeof(Type) && bytes % 16 == 0) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); @@ -118,10 +134,16 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, ///@todo: come up with a cleaner interface to support these cases in future! template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector1, const Type *vector2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector1, + const Type* vector2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; @@ -154,15 +176,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -185,11 +213,19 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); + size_t bytes = stride * sizeof(Type); if (16 / sizeof(Type) && bytes % 16 == 0) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index 9d1538c172..a3fcc5bac6 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -24,7 +24,7 @@ namespace linalg { /** * @brief CUDA version mean squared error function mean((A-B)**2) * @tparam math_t data-type upon which the math operation will be performed - * @tparam TPB threads-per-block + * @tparam TPB threads-per-block * @param out the output mean squared error value (assumed to be a device pointer) * @param A input array (assumed to be a device pointer) * @param B input array (assumed to be a device pointer) @@ -33,14 +33,14 @@ namespace linalg { * @param stream cuda-stream where to launch this kernel */ template -void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len, - math_t weight, cudaStream_t stream) { +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { math_t diff = a - b; return diff * diff * weight / len; }; - mapThenSumReduce(out, len, sq_diff, stream, A, - B); + mapThenSumReduce(out, len, sq_diff, stream, A, B); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index ce948c927d..53d57ecd00 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 64930a7123..82558c8023 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -44,22 +44,46 @@ enum NormType { L1Norm = 0, L2Norm }; * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void rowNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } @@ -77,22 +101,46 @@ void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void colNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index cafa8d54f1..c2455ac3a8 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -40,15 +40,19 @@ namespace linalg { * @{ */ template -void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, - int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; int k = min(m, n); - CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); raft::mr::device::buffer tau(allocator, stream, k); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); @@ -58,19 +62,16 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); raft::mr::device::buffer workspace(allocator, stream, Lwork); - CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); #endif - CUSOLVER_CHECK( - cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); } /** @@ -84,30 +85,41 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, * @param stream cuda stream */ template -void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, - int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; raft::mr::device::buffer R_full(allocator, stream, m * n); raft::mr::device::buffer tau(allocator, stream, min(m, n)); - CUDA_CHECK( - cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; - CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Lwork; raft::mr::device::buffer devInfo(allocator, stream, 1); - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, - R_full_ncols, R_full.data(), - R_full_nrows, &Lwork)); + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); raft::mr::device::buffer workspace(allocator, stream, Lwork); - CUSOLVER_CHECK(cusolverDngeqrf( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, - tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, + R_full_nrows, + R_full_ncols, + R_full.data(), + R_full_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); @@ -115,17 +127,24 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); - CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Q_nrows = m, Q_ncols = n; - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, - min(Q_ncols, Q_nrows), Q, Q_nrows, - tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr( - cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), - workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, + Q_nrows, + Q_ncols, + min(Q_ncols, Q_nrows), + Q, + Q_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); } /** @} */ diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index d39577bbdd..693a797db9 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -52,28 +52,33 @@ namespace linalg { * @param reduce_op binary reduction operation * @param final_op elementwise operation to apply before storing results */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void reduce(OutType *dots, const InType *data, int D, int N, OutType init, - bool rowMajor, bool alongRows, cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void reduce(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + bool rowMajor, + bool alongRows, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index bba652e137..f931c976fd 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -28,14 +28,15 @@ namespace linalg { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, - int N, Type init, MainLambda main_op) { +__global__ void stridedSummationKernel( + Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) +{ // Thread reduction Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; - int stride = blockDim.y * gridDim.y; + int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; thread_data += main_op(data[idx], j); @@ -44,8 +45,8 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, // Block reduction extern __shared__ char tmp[]; // One element per thread in block - Type *temp = (Type *)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; + Type* temp = (Type*)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { @@ -54,24 +55,31 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, } // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) - raft::myAtomicAdd(dots + colStart, temp[myidx]); + if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); } // Kernel to perform reductions along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout -template -__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, - int N, OutType init, MainLambda main_op, - ReduceLambda reduce_op) { +template +__global__ void stridedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) +{ // Thread reduction OutType thread_data = init; - IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; - IdxType stride = blockDim.y * gridDim.y; + IdxType stride = blockDim.y * gridDim.y; for (IdxType j = rowStart; j < N; j += stride) { IdxType idx = colStart + j * D; thread_data = reduce_op(thread_data, main_op(data[idx], j)); @@ -79,14 +87,13 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, } // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - auto *temp = (OutType *)tmp; // Cast to desired type + extern __shared__ char tmp[]; // One element per thread in block + auto* temp = (OutType*)tmp; // Cast to desired type IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); - temp[myidx] = thread_data; + temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) - temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); __syncthreads(); } @@ -122,15 +129,23 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void stridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ ///@todo: this extra should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) @@ -140,7 +155,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, // Arbitrary numbers for now, probably need to tune const dim3 thrds(32, 16); IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; @@ -153,8 +168,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, <<>>(dots, data, D, N, init, main_op); else stridedReductionKernel - <<>>(dots, data, D, N, init, main_op, - reduce_op); + <<>>(dots, data, D, N, init, main_op, reduce_op); ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 882c105689..43060d0818 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -38,8 +38,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; unaryOp(out, in, len, op, stream); } @@ -58,24 +58,25 @@ void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { - //TODO: kernel do not use shared memory in current implementation +__global__ void subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] - *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -86,9 +87,12 @@ __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @remark block size has not been tuned */ template -void subtractDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // Just for the note - there is no way to express such operation with cuBLAS in effective way // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 7357a68a4c..1cb8b7592f 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -50,14 +50,21 @@ namespace linalg { // TODO: couldn't template this function due to cusolverDnSgesvd and // cusolverSnSgesvd. Check if there is any other way. template -void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, - T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, - bool trans_right, bool gen_left_vec, bool gen_right_vec, - cudaStream_t stream) { - std::shared_ptr allocator = - handle.get_device_allocator(); - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); +void svdQR(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* sing_vals, + T* left_sing_vecs, + T* right_sing_vecs, + bool trans_right, + bool gen_left_vec, + bool gen_right_vec, + cudaStream_t stream) +{ + std::shared_ptr allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 // 46340: sqrt of max int value @@ -72,14 +79,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int n = n_cols; raft::mr::device::buffer devInfo(allocator, stream, 1); - T *d_rwork = nullptr; + T* d_rwork = nullptr; int lwork = 0; - CUSOLVER_CHECK( - cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + CUSOLVER_CHECK(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); - char jobu = 'S'; + char jobu = 'S'; char jobvt = 'A'; if (!gen_left_vec) { @@ -92,9 +98,23 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, strcpy(&jobvt, &new_vt); } - CUSOLVER_CHECK(cusolverDngesvd( - cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, - right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngesvd(cusolverH, + jobu, + jobvt, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + d_rwork, + devInfo.data(), + stream)); // Transpose the right singular vector back if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); @@ -110,19 +130,37 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, } template -void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, - T *U, T *V, bool gen_left_vec, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void svdEig(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* S, + T* U, + T* V, + bool gen_left_vec, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; raft::mr::device::buffer in_cross_mult(allocator, stream, len); T alpha = T(1); - T beta = T(0); - raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(), - n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, + T beta = T(0); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + in, + in_cross_mult.data(), + n_cols, + n_cols, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, stream); eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); @@ -133,10 +171,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); if (gen_left_vec) { - raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); - raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, - true, stream); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + V, + U, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream); } } @@ -158,11 +206,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, * @param stream cuda stream */ template -void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - math_t *sing_vals, math_t *left_sing_vecs, - math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, - math_t tol, int max_sweeps, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void svdJacobi(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + math_t* sing_vals, + math_t* left_sing_vecs, + math_t* right_sing_vecs, + bool gen_left_vec, + bool gen_right_vec, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -177,18 +234,42 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, raft::mr::device::buffer devInfo(allocator, stream, 1); int lwork = 0; - int econ = 1; - - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); raft::mr::device::buffer d_work(allocator, stream, lwork); - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), - gesvdj_params, stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + devInfo.data(), + gesvdj_params, + stream)); CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -207,18 +288,36 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @param stream cuda stream */ template -void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, - math_t *V, math_t *out, int n_rows, int n_cols, int k, - cudaStream_t stream) { +void svdReconstruction(const raft::handle_t& handle, + math_t* U, + math_t* S, + math_t* V, + math_t* out, + int n_rows, + int n_cols, + int k, + cudaStream_t stream) +{ auto allocator = handle.get_device_allocator(); const math_t alpha = 1.0, beta = 0.0; raft::mr::device::buffer SVT(allocator, stream, k * n_cols); - raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, - CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::linalg::gemm( + handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, + U, + n_rows, + k, + SVT.data(), + out, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); } /** @@ -236,10 +335,18 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, * @param stream cuda stream */ template -bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, - math_t *S_vec, math_t *V, int n_rows, int n_cols, - int k, math_t tol, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +bool evaluateSVDByL2Norm(const raft::handle_t& handle, + math_t* A_d, + math_t* U, + math_t* S_vec, + math_t* V, + int n_rows, + int n_cols, + int k, + math_t tol, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; @@ -263,16 +370,25 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, // calculate percent error const math_t alpha = 1.0, beta = -1.0; raft::mr::device::buffer A_minus_P(allocator, stream, m * n); - CUDA_CHECK( - cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - - CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, - &alpha, A_d, m, &beta, P_d.data(), m, - A_minus_P.data(), m, stream)); - - math_t norm_A_minus_P = - raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); - math_t percent_error = 100.0 * norm_A_minus_P / normA; + CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); + + math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; return (percent_error / 100.0 < tol); } diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index d90f6271fa..9b954c29c1 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -33,18 +33,34 @@ namespace linalg { * @param stream: cuda stream */ template -void transpose(const raft::handle_t &handle, math_t *in, math_t *out, - int n_rows, int n_cols, cudaStream_t stream) { +void transpose(const raft::handle_t& handle, + math_t* in, + math_t* out, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); int out_n_rows = n_cols; int out_n_cols = n_rows; const math_t alpha = 1.0; - const math_t beta = 0.0; - CUBLAS_CHECK(raft::linalg::cublasgeam( - cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in, - n_rows, &beta, out, out_n_rows, out, out_n_rows, stream)); + const math_t beta = 0.0; + CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); } /** @@ -54,24 +70,25 @@ void transpose(const raft::handle_t &handle, math_t *in, math_t *out, * @param stream: cuda stream */ template -void transpose(math_t *inout, int n, cudaStream_t stream) { - auto m = n; - auto size = n * n; - auto d_inout = inout; +void transpose(math_t* inout, int n, cudaStream_t stream) +{ + auto m = n; + auto size = n * n; + auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, - [=] __device__(int idx) { - int s_row = idx % m; - int s_col = idx / m; - int d_row = s_col; - int d_col = s_row; - if (s_row < s_col) { - auto temp = d_inout[d_col * m + d_row]; - d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; - d_inout[s_col * m + s_row] = temp; - } - }); + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 46b4d296cb..198b9b2b10 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -23,10 +23,9 @@ namespace raft { namespace linalg { -template -__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, - Lambda op) { +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a; @@ -42,12 +41,10 @@ __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, b.store(out, idx); } -template -void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); unaryOpKernel <<>>(out, in, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -68,47 +65,38 @@ void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val);` */ -template -void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - if (len <= 0) return; //silently skip in case of 0 length input - constexpr auto maxSize = - sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && - outAddr % 16 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && - outAddr % 8 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && - outAddr % 4 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && - outAddr % 2 == 0) { - unaryOpImpl( - out, in, len, op, stream); +template +void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); } else if (1 / maxSize) { - unaryOpImpl( - out, in, len, op, stream); + unaryOpImpl(out, in, len, op, stream); } else { - unaryOpImpl(out, in, len, op, - stream); + unaryOpImpl(out, in, len, op, stream); } } template -__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { - op(out + idx, idx); - } + if (idx < len) { op(out + idx, idx); } } /** @@ -128,14 +116,12 @@ __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { * where outLocationOffset will be out + idx. * @param[in] stream cuda stream where to launch work */ -template -void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, - cudaStream_t stream) { +template +void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel - <<>>(out, len, op); + writeOnlyUnaryOpKernel<<>>(out, len, op); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index 0a72117140..579491b5cc 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -41,14 +41,18 @@ namespace matrix { * @param stream cuda stream */ template -void power(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream) { - auto d_src = in; +void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) +{ + auto d_src = in; auto d_dest = out; raft::linalg::binaryOp( - d_dest, d_src, d_src, len, - [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream); + d_dest, + d_src, + d_src, + len, + [=] __device__(math_t a, math_t b) { return scalar * a * b; }, + stream); } /** @@ -59,7 +63,8 @@ void power(math_t *in, math_t *out, math_t scalar, int len, * @param stream cuda stream */ template -void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { +void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) +{ power(inout, inout, scalar, len, stream); } @@ -70,7 +75,8 @@ void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { * @param stream cuda stream */ template -void power(math_t *inout, int len, cudaStream_t stream) { +void power(math_t* inout, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(inout, scalar, len, stream); } @@ -84,7 +90,8 @@ void power(math_t *inout, int len, cudaStream_t stream) { * @{ */ template -void power(math_t *in, math_t *out, int len, cudaStream_t stream) { +void power(math_t* in, math_t* out, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(in, out, scalar, len, stream); } @@ -101,13 +108,20 @@ void power(math_t *in, math_t *out, int len, cudaStream_t stream) { * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, - cudaStream_t stream, bool set_neg_zero = false) { - auto d_src = in; +void seqRoot(math_t* in, + math_t* out, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool set_neg_zero = false) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (set_neg_zero) { if (a < math_t(0)) { @@ -133,8 +147,9 @@ void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool set_neg_zero = false) { +void seqRoot( + math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) +{ seqRoot(inout, inout, scalar, len, stream, set_neg_zero); } @@ -148,22 +163,27 @@ void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(in, out, scalar, len, stream); } template -void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(inout, inout, scalar, len, stream); } template -void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, - cudaStream_t stream, math_t thres = 1e-15) { +void setSmallValuesZero( + math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ raft::linalg::unaryOp( - out, in, len, + out, + in, + len, [=] __device__(math_t a) { if (a <= thres && -a <= thres) { return math_t(0); @@ -184,8 +204,8 @@ void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, * @param thres: threshold */ template -void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, - math_t thres = 1e-15) { +void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ setSmallValuesZero(inout, inout, len, stream, thres); } @@ -203,14 +223,21 @@ void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, * @{ */ template -void reciprocal(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream, bool setzero = false, - math_t thres = 1e-15) { - auto d_src = in; +void reciprocal(math_t* in, + math_t* out, + math_t scalar, + int len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (setzero) { if (abs(a) <= thres) { @@ -237,8 +264,13 @@ void reciprocal(math_t *in, math_t *out, math_t scalar, int len, * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0) */ template -void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool setzero = false, math_t thres = 1e-15) { +void reciprocal(math_t* inout, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ reciprocal(inout, inout, scalar, len, stream, setzero, thres); } @@ -251,7 +283,8 @@ void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(inout, scalar, len, stream); } @@ -266,14 +299,15 @@ void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { * @param stream cuda stream */ template -void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(in, out, scalar, len, stream); } template -void setValue(math_t *out, const math_t *in, math_t scalar, int len, - cudaStream_t stream = 0) { +void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) +{ raft::linalg::unaryOp( out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); } @@ -289,46 +323,44 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param stream cuda stream */ template -void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, - cudaStream_t stream) { - auto d_src = src; +void ratio( + const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) +{ + auto d_src = src; auto d_dest = dest; - std::shared_ptr allocator = - handle.get_device_allocator(); + std::shared_ptr allocator = handle.get_device_allocator(); raft::mr::device::buffer d_sum(allocator, stream, 1); - auto *d_sum_ptr = d_sum.data(); - auto no_op = [] __device__(math_t in) { return in; }; + auto* d_sum_ptr = d_sum.data(); + auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); raft::linalg::unaryOp( - d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, - stream); + d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); } /** @} */ // Computes the argmax(d_in) column-wise in a DxN matrix template -__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) { +__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(-1, -raft::myInf()); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx])); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); - if (threadIdx.x == 0) { - argmax[blockIdx.x] = maxKV.key; - } + if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; } } /** @@ -340,8 +372,8 @@ __global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) { * @param stream: cuda stream */ template -void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, - cudaStream_t stream) { +void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +{ int D = n_rows; int N = n_cols; if (D <= 32) { @@ -360,30 +392,29 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T *d_in, int D, int N) { +__global__ void signFlipKernel(T* d_in, int D, int N) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax (with abs()) index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(0, 0); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx]))); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); // flip column sign if d_in[maxIndex] < 0 __shared__ bool need_sign_flip; - if (threadIdx.x == 0) { - need_sign_flip = d_in[maxKV.key] < T(0); - } + if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); } __syncthreads(); if (need_sign_flip) { for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; d_in[idx] = -d_in[idx]; } } @@ -398,9 +429,10 @@ __global__ void signFlipKernel(T *d_in, int D, int N) { * @param stream cuda stream */ template -void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { - int D = n_rows; - int N = n_cols; +void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) +{ + int D = n_rows; + int N = n_cols; auto data = inout; if (D <= 32) { signFlipKernel<<>>(data, D, N); @@ -415,20 +447,43 @@ void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { } template -void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryMult(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a * b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, + stream); } template -void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream) { +void matrixVectorBinaryMultSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (b == Type(0)) return a; @@ -439,22 +494,45 @@ void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryDiv(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a / b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, + stream); } template -void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream, - bool return_zero = false) { +void matrixVectorBinaryDivSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ if (return_zero) { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return Type(0); @@ -464,7 +542,13 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, stream); } else { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return a; @@ -476,21 +560,45 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryAdd(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } template -void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinarySub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } }; // end namespace matrix diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 5f5755e24e..71a2888545 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -49,29 +49,33 @@ using namespace std; * @param rowMajor whether the matrix has row major layout */ template -void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, - const idx_array_t *indices, idx_t n_rows_indices, - cudaStream_t stream, bool rowMajor = false) { +void copyRows(const m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + const idx_array_t* indices, + idx_t n_rows_indices, + cudaStream_t stream, + bool rowMajor = false) +{ if (rowMajor) { const idx_t TPB = 256; - cache:: - get_vecs<<>>( - in, n_cols, indices, n_rows_indices, out); + cache::get_vecs<<>>( + in, n_cols, indices, n_rows_indices, out); CUDA_CHECK(cudaPeekAtLastError()); return; } - idx_t size = n_rows_indices * n_cols; + idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % n_rows_indices; - idx_t col = idx / n_rows_indices; + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % n_rows_indices; + idx_t col = idx / n_rows_indices; - out[col * n_rows_indices + row] = - in[col * n_rows + indices[row]]; - }); + out[col * n_rows_indices + row] = in[col * n_rows + indices[row]]; + }); } /** @@ -83,8 +87,8 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, * @param stream: cuda stream */ template -void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ raft::copy_async(out, in, n_rows * n_cols, stream); } @@ -99,21 +103,22 @@ void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, - idx_t out_n_cols, cudaStream_t stream) { - auto m = out_n_rows; - auto k = in_n_rows; - idx_t size = out_n_rows * out_n_cols; - auto d_q = in; +void truncZeroOrigin( + m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) +{ + auto m = out_n_rows; + auto k = in_n_rows; + idx_t size = out_n_rows * out_n_cols; + auto d_q = in; auto d_q_trunc = out; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % m; - idx_t col = idx / m; - d_q_trunc[col * m + row] = d_q[col * k + row]; - }); + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % m; + idx_t col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); } /** @@ -125,24 +130,25 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, * @param stream: cuda stream */ template -void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto n = n_cols; - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto n = n_cols; + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = dest_row; - idx_t src_col = (n - dest_col) - 1; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = dest_row; + idx_t src_col = (n - dest_col) - 1; + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -154,25 +160,26 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param stream: cuda stream */ template -void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = (m - dest_row) - 1; + ; + idx_t src_col = dest_col; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -184,16 +191,16 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param v_separator: vertical separator character */ template -void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', - char v_separator = '\n') { +void print( + const m_t* in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', char v_separator = '\n') +{ std::vector h_matrix = std::vector(n_cols * n_rows); - CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), - cudaMemcpyDeviceToHost)); + CUDA_CHECK( + cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), cudaMemcpyDeviceToHost)); for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f%c", h_matrix[j * n_rows + i], - j < n_cols - 1 ? h_separator : v_separator); + printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator); } } } @@ -205,7 +212,8 @@ void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', * @param n_cols: number of columns of input matrix */ template -void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { +void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) +{ for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { printf("%1.4f ", in[j * n_rows + i]); @@ -226,8 +234,9 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { * (1-based) */ template -__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, - idx_t y1, idx_t x2, idx_t y2) { +__global__ void slice( + m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; if (idx < dm * dn) { @@ -251,8 +260,16 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, * @param stream: cuda stream */ template -void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, - idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { +void sliceMatrix(m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2, + cudaStream_t stream) +{ // Slicing dim3 block(64); dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); @@ -268,15 +285,13 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, - idx_t n_cols, idx_t k) { +__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; if (idx < m * n) { idx_t i = idx % m, j = idx / m; - if (i < k && j < k && j >= i) { - dst[i + j * k] = src[idx]; - } + if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; } } } @@ -289,8 +304,8 @@ __global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, * @param stream: cuda stream */ template -void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t m = n_rows, n = n_cols; idx_t k = min(m, n); dim3 block(64); @@ -307,13 +322,11 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, - idx_t n, idx_t k) { +__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < k) { - matrix[idx + idx * m] = vec[idx]; - } + if (idx < k) { matrix[idx + idx * m] = vec[idx]; } } /** @@ -325,13 +338,13 @@ __global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, * @param stream: cuda stream */ template -void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void initializeDiagonalMatrix( + m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t k = min(n_rows, n_cols); dim3 block(64); dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, - n_cols, k); + copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); } /** @@ -341,11 +354,10 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t *in, idx_t len) { +__global__ void matrixDiagonalInverse(m_t* in, idx_t len) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < len) { - in[idx + idx * len] = 1.0 / in[idx + idx * len]; - } + if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } } /** @@ -355,7 +367,8 @@ __global__ void matrixDiagonalInverse(m_t *in, idx_t len) { * @param stream: cuda stream */ template -void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { +void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) +{ dim3 block(64); dim3 grid((len + block.x - 1) / block.x); matrixDiagonalInverse<<>>(in, len); @@ -369,12 +382,11 @@ void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { * @param stream: cuda stream */ template -m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size, - cudaStream_t stream) { +m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +{ cublasHandle_t cublasH = handle.get_cublas_handle(); - m_t normval = 0; - CUBLAS_CHECK( - raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + m_t normval = 0; + CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 29e0d7cfcd..18c8be5f45 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -35,11 +35,11 @@ namespace mr { template class buffer_base { public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; + using size_type = std::size_t; + using value_type = T; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = T&; using const_reference = const T&; buffer_base() = delete; @@ -55,16 +55,12 @@ class buffer_base { * @param[in] stream cuda stream where this allocation operations are async * @param[in] n size of the buffer (in number of elements) */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, - size_type n = 0) - : data_(nullptr), - size_(n), - capacity_(n), - stream_(stream), - allocator_(std::move(allocator)) { + buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) + : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) + { if (capacity_ > 0) { - data_ = static_cast( - allocator_->allocate(capacity_ * sizeof(value_type), stream_)); + data_ = + static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); CUDA_CHECK(cudaStreamSynchronize(stream_)); } } @@ -98,23 +94,23 @@ class buffer_base { * @param[in] stream cuda stream where allocation operations are queued * @{ */ - void reserve(size_type new_capacity) { + void reserve(size_type new_capacity) + { if (new_capacity > capacity_) { - auto* new_data = static_cast( - allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { - raft::copy(new_data, data_, size_, stream_); - } + auto* new_data = + static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); + if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } // Only deallocate if we have allocated a pointer if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = new_data; + data_ = new_data; capacity_ = new_capacity; } } - void reserve(size_type new_capacity, cudaStream_t stream) { + void reserve(size_type new_capacity, cudaStream_t stream) + { set_stream(stream); reserve(new_capacity); } @@ -127,12 +123,14 @@ class buffer_base { * @param[in] stream cuda stream where the work will be queued * @{ */ - void resize(const size_type new_size) { + void resize(const size_type new_size) + { reserve(new_size); size_ = new_size; } - void resize(const size_type new_size, cudaStream_t stream) { + void resize(const size_type new_size, cudaStream_t stream) + { set_stream(stream); resize(new_size); } @@ -146,16 +144,18 @@ class buffer_base { * @param[in] stream cuda stream where the work will be queued * @{ */ - void release() { + void release() + { if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = nullptr; + data_ = nullptr; capacity_ = 0; - size_ = 0; + size_ = 0; } - void release(cudaStream_t stream) { + void release(cudaStream_t stream) + { set_stream(stream); release(); } @@ -195,7 +195,8 @@ class buffer_base { * @param[in] stream new cuda stream to be set. If it is the same as the * current one, then this method will be a no-op. */ - void set_stream(cudaStream_t stream) { + void set_stream(cudaStream_t stream) + { if (stream_ != stream) { cudaEvent_t event; CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp index 889e1640db..e930b617e0 100644 --- a/cpp/include/raft/mr/device/allocator.hpp +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -32,17 +32,20 @@ namespace device { * further to the ones listed in `Allocator`: * - Allocations may be always on the device that was specified on construction. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default device allocator based on the one provided by RMM */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { rmm::mr::get_current_device_resource()->deallocate(p, n, stream); } }; // class default_allocator diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp index 39b5674ce4..2b9d84368f 100644 --- a/cpp/include/raft/mr/device/buffer.hpp +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -46,11 +46,11 @@ namespace device { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -60,7 +60,9 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } }; // class buffer }; // namespace device diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index 8af266d4f0..62b6826211 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -34,20 +34,23 @@ namespace host { * further to the ones listed in `Allocator`: * - Allocations don't need to be zero copy accessible form a device. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default cudaMallocHost/cudaFreeHost based host allocator */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = nullptr; CUDA_CHECK(cudaMallocHost(&ptr, n)); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { - //Must call _NO_THROW here since this is called frequently from object - //destructors which are "nothrow" by default + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { + // Must call _NO_THROW here since this is called frequently from object + // destructors which are "nothrow" by default CUDA_CHECK_NO_THROW(cudaFreeHost(p)); } }; // class default_allocator diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp index 3c505bf2ed..52475ad6ec 100644 --- a/cpp/include/raft/mr/host/buffer.hpp +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -48,11 +48,11 @@ namespace host { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -62,14 +62,15 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) { - if (other.size() > 0) { - raft::copy(data_, other.data(), other.size(), other.get_stream()); - } + : buffer_base(alloc, other.get_stream(), other.size()) + { + if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } } buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } reference operator[](size_type pos) { return data_[pos]; } diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 56710ea81f..5267770e8a 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -43,10 +43,9 @@ enum GeneratorType { GenKiss99 }; -template -__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda randOp) { +template +__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; detail::Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -58,10 +57,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, } // used for Box-Muller type transformations -template -__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda2 rand2Op) { +template +__global__ void rand2Kernel( + uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; detail::Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -77,8 +76,9 @@ __global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, } template -__global__ void constFillKernel(Type *ptr, int len, Type val) { - unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; +__global__ void constFillKernel(Type* ptr, int len, Type val) +{ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; const unsigned stride = gridDim.x * blockDim.x; for (unsigned idx = tid; idx < len; idx += stride) { ptr[idx] = val; @@ -99,19 +99,20 @@ __global__ void constFillKernel(Type *ptr, int len, Type val) { * @{ */ template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { - constexpr Type twoPi = Type(2.0) * Type(3.141592654); +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +{ + constexpr Type twoPi = Type(2.0) * Type(3.141592654); constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); - Type theta = twoPi * val2; + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; Type s, c; raft::mySinCos(theta, s, c); val1 = R * c * sigma1 + mu1; val2 = R * s * sigma2 + mu2; } template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +{ box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); } /** @} */ @@ -131,7 +132,8 @@ class Rng { // simple heuristic to make sure all SMs will be occupied properly // and also not too many initialization calls will be made by each thread nBlocks(4 * getMultiProcessorCount()), - gen() { + gen() + { seed(_s); } @@ -142,7 +144,8 @@ class Rng { * function of timestamp. Another example is to use the c++11's * `std::random_device` for setting seed. */ - void seed(uint64_t _s) { + void seed(uint64_t _s) + { gen.seed(_s); offset = 0; } @@ -158,7 +161,8 @@ class Rng { * @param[out] b intercept parameter */ template - void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + void affine_transform_params(IdxT n, IdxT& a, IdxT& b) + { // always keep 'a' to be coprime to 'n' a = gen() % n; while (gcd(a, n) != 1) { @@ -181,27 +185,24 @@ class Rng { * @{ */ template - void uniform(Type *ptr, LenType len, Type start, Type end, - cudaStream_t stream) { + void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'uniform' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return (val * (end - start)) + start; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; }, stream); } template - void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'uniformInt' can only be integer!"); + void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'uniformInt' can only be integer!"); custom_distribution( - ptr, len, - [=] __device__(IntType val, LenType idx) { - return (val % (end - start)) + start; - }, + ptr, + len, + [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; }, stream); } /** @} */ @@ -218,28 +219,37 @@ class Rng { * @{ */ template - void normal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'normal' can only be floating point!"); rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'normalInt' can only be integer!"); + void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'normalInt' can only be integer!"); rand2Impl( - offset, ptr, len, - [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) { + offset, + ptr, + len, + [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } /** @} */ @@ -264,21 +274,32 @@ class Rng { * @param stream stream where to launch the kernel */ template - void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, - const Type *sigma_vec, Type sigma, cudaStream_t stream) { + void normalTable(Type* ptr, + LenType n_rows, + LenType n_cols, + const Type* mu, + const Type* sigma_vec, + Type sigma, + cudaStream_t stream) + { rand2Impl( - offset, ptr, n_rows * n_cols, + offset, + ptr, + n_rows * n_cols, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { // yikes! use fast-int-div - auto col1 = idx1 % n_cols; - auto col2 = idx2 % n_cols; + auto col1 = idx1 % n_cols; + auto col2 = idx2 % n_cols; auto mean1 = mu[col1]; auto mean2 = mu[col2]; - auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; - auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; + auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; + auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; box_muller_transform(val1, val2, sig1, mean1, sig2, mean2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } /** @@ -291,7 +312,8 @@ class Rng { * @param stream stream where to launch the kernel */ template - void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) + { constFillKernel<<>>(ptr, len, val); CUDA_CHECK(cudaPeekAtLastError()); } @@ -309,10 +331,10 @@ class Rng { * @param[in] stream stream where to launch the kernel */ template - void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) + { custom_distribution( - ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, - stream); + ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream); } /** @@ -326,15 +348,14 @@ class Rng { * @param stream stream where to launch the kernel */ template - void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, - cudaStream_t stream) { + void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'scaled_bernoulli' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return val > prob ? -scale : scale; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; }, stream); } @@ -350,12 +371,12 @@ class Rng { * @note https://en.wikipedia.org/wiki/Gumbel_distribution */ template - void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream) + { custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return mu - beta * raft::myLog(-raft::myLog(val)); - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); }, stream); } @@ -370,16 +391,21 @@ class Rng { * @param stream stream where to launch the kernel */ template - void lognormal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); val1 = raft::myExp(val1); val2 = raft::myExp(val2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } /** @@ -393,10 +419,11 @@ class Rng { * @param stream stream where to launch the kernel */ template - void logistic(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return mu - scale * raft::myLog(one / val - one); @@ -414,9 +441,11 @@ class Rng { * @param stream stream where to launch the kernel */ template - void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return -raft::myLog(one - val) / lambda; @@ -434,9 +463,11 @@ class Rng { * @param stream stream where to launch the kernel */ template - void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; constexpr Type two = (Type)2.0; @@ -456,13 +487,14 @@ class Rng { * @param stream stream where to launch the kernel */ template - void laplace(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { - constexpr Type one = (Type)1.0; - constexpr Type two = (Type)2.0; + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; constexpr Type oneHalf = (Type)0.5; Type out; if (val <= oneHalf) { @@ -502,43 +534,44 @@ class Rng { * @param stream cuda stream */ template - void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, - IdxT *outIdx, const DataT *in, - const WeightsT *wts, IdxT sampledLen, IdxT len, - cudaStream_t stream) { - ASSERT(sampledLen <= len, - "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - - std::shared_ptr allocator = - handle.get_device_allocator(); + void sampleWithoutReplacement(const raft::handle_t& handle, + DataT* out, + IdxT* outIdx, + const DataT* in, + const WeightsT* wts, + IdxT sampledLen, + IdxT len, + cudaStream_t stream) + { + ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); + + std::shared_ptr allocator = handle.get_device_allocator(); raft::mr::device::buffer expWts(allocator, stream, len); raft::mr::device::buffer sortedWts(allocator, stream, len); raft::mr::device::buffer inIdx(allocator, stream, len); raft::mr::device::buffer outIdxBuff(allocator, stream, len); - auto *inIdxPtr = inIdx.data(); + auto* inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( - expWts.data(), len, + expWts.data(), + len, [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) { - inIdxPtr[idx] = idx; + inIdxPtr[idx] = idx; constexpr WeightsT one = (WeightsT)1.0; - auto exp = -raft::myLog(one - val); - if (wts != nullptr) { - return exp / wts[idx]; - } + auto exp = -raft::myLog(one - val); + if (wts != nullptr) { return exp / wts[idx]; } return exp; }, stream); ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items - IdxT *outIdxPtr = outIdxBuff.data(); + IdxT* outIdxPtr = outIdxBuff.data(); raft::mr::device::buffer workspace(allocator, stream); - sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, - (int)len, stream); + sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { - CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync( + outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } scatter(out, in, outIdxPtr, sampledLen, stream); } @@ -558,17 +591,15 @@ class Rng { * @param[in] stream cuda stream * @{ */ - template - void custom_distribution(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { randImpl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } - template - void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { rand2Impl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } @@ -591,12 +622,10 @@ class Rng { static const int NumThreads = 256; template - uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len, - int nThreads, int nBlocks) { + uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks) + { LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads)); - if (IsNormal && itemsPerThread % 2 == 1) { - ++itemsPerThread; - } + if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; } // curand uses 2 32b uint's to generate one double uint64_t factor = sizeof(Type) / sizeof(float); if (factor == 0) ++factor; @@ -604,22 +633,26 @@ class Rng { // If not, then generate new seed and start from zero offset uint64_t newOffset = offset + LenType(itemsPerThread) * factor; if (newOffset < offset) { - offset = 0; - seed = gen(); + offset = 0; + seed = gen(); newOffset = itemsPerThread * factor; } return newOffset; } - template - void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void randImpl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda randOp, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - uint64_t seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + uint64_t seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: randKernel @@ -633,26 +666,28 @@ class Rng { randKernel <<>>(seed, offset, ptr, len, randOp); break; - default: - ASSERT(false, "randImpl: Incorrect generator type! %d", type); + default: ASSERT(false, "randImpl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; } - template - void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void rand2Impl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda2 rand2Op, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - auto seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + auto seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: - rand2Kernel + rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenTaps: @@ -660,12 +695,10 @@ class Rng { <<>>(seed, offset, ptr, len, rand2Op); break; case GenKiss99: - rand2Kernel + rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; - default: - ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); + default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh index d44c6f018b..485f4ddd68 100644 --- a/cpp/include/raft/random/rng_impl.cuh +++ b/cpp/include/raft/random/rng_impl.cuh @@ -33,7 +33,8 @@ struct PhiloxGenerator { * @param subsequence as found in curand docs * @param offset as found in curand docs */ - DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { curand_init(seed, subsequence, offset, &state); } @@ -44,18 +45,21 @@ struct PhiloxGenerator { DI void next(float& ret) { ret = curand_uniform(&(this->state)); } DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } DI void next(uint32_t& ret) { ret = curand(&(this->state)); } - DI void next(uint64_t& ret) { + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t& ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -76,8 +80,9 @@ struct TapsGenerator { * @param subsequence unused * @param offset unused */ - DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; uint64_t stride = blockDim.x * gridDim.x; delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; stride *= blockDim.y * gridDim.y; @@ -90,31 +95,36 @@ struct TapsGenerator { * @{ */ template - DI void next(Type& ret) { + DI void next(Type& ret) + { constexpr double ULL_LARGE = 1.8446744073709551614e19; uint64_t val; next(val); ret = static_cast(val); ret /= static_cast(ULL_LARGE); } - DI void next(uint64_t& ret) { + DI void next(uint64_t& ret) + { constexpr uint64_t TAPS = 0x8000100040002000ULL; - constexpr int ROUNDS = 128; + constexpr int ROUNDS = 128; for (int i = 0; i < ROUNDS; i++) state = (state >> 1) ^ (-(state & 1ULL) & TAPS); ret = state; } - DI void next(uint32_t& ret) { + DI void next(uint32_t& ret) + { uint64_t val; next(val); ret = (uint32_t)val; } - DI void next(int32_t& ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -135,46 +145,49 @@ struct Kiss99Generator { * @param subsequence unused * @param offset unused */ - DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - initKiss99(seed); - } + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); } /** * @defgroup NextRand Generate the next random number * @{ */ template - DI void next(Type& ret) { + DI void next(Type& ret) + { constexpr double U_LARGE = 4.294967295e9; uint32_t val; next(val); ret = static_cast(val); ret /= static_cast(U_LARGE); } - DI void next(uint32_t& ret) { + DI void next(uint32_t& ret) + { uint32_t MWC; - z = 36969 * (z & 65535) + (z >> 16); - w = 18000 * (w & 65535) + (w >> 16); + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); MWC = ((z << 16) + w); jsr ^= (jsr << 17); jsr ^= (jsr >> 13); jsr ^= (jsr << 5); jcong = 69069 * jcong + 1234567; - MWC = ((MWC ^ jcong) + jsr); - ret = MWC; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; } - DI void next(uint64_t& ret) { + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t& ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -193,7 +206,8 @@ struct Kiss99Generator { // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower // 128 bits. It uses 32-bit wide multiply only. - DI void mulByFnv1a128Prime(uint32_t* h) { + DI void mulByFnv1a128Prime(uint32_t* h) + { typedef union { uint32_t u32[2]; uint64_t u64[1]; @@ -217,12 +231,12 @@ struct Kiss99Generator { // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); uint32_t carry = 0; - h[0] = h0p0.u32[0]; + h[0] = h0p0.u32[0]; - h[1] = h0p0.u32[1] + h1p0.u32[0]; + h[1] = h0p0.u32[1] + h1p0.u32[0]; carry = h[1] < h0p0.u32[1] ? 1 : 0; - h[2] = h1p0.u32[1] + carry; + h[2] = h1p0.u32[1] + carry; carry = h[2] < h1p0.u32[1] ? 1 : 0; h[2] += h2p0.u32[0]; carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; @@ -233,7 +247,8 @@ struct Kiss99Generator { return; } - DI void fnv1a128(uint32_t* hash, uint32_t txt) { + DI void fnv1a128(uint32_t* hash, uint32_t txt) + { hash[0] ^= (txt >> 0) & 0xFF; mulByFnv1a128Prime(hash); hash[0] ^= (txt >> 8) & 0xFF; @@ -244,7 +259,8 @@ struct Kiss99Generator { mulByFnv1a128Prime(hash); } - DI void initKiss99(uint64_t seed) { + DI void initKiss99(uint64_t seed) + { // Initialize hash to 128-bit FNV1a basis uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; @@ -259,9 +275,9 @@ struct Kiss99Generator { fnv1a128(hash, uint32_t(seed >> 32)); // Initialize KISS99 state with hash - z = hash[0]; - w = hash[1]; - jsr = hash[2]; + z = hash[0]; + w = hash[1]; + jsr = hash[2]; jcong = hash[3]; } }; @@ -273,10 +289,13 @@ struct Kiss99Generator { template struct Generator { DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) - : gen(seed, subsequence, offset) {} + : gen(seed, subsequence, offset) + { + } template - DI void next(Type& ret) { + DI void next(Type& ret) + { gen.next(ret); } diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh index e367550060..5d38bdf4a8 100644 --- a/cpp/include/raft/sparse/convert/coo.cuh +++ b/cpp/include/raft/sparse/convert/coo.cuh @@ -37,14 +37,18 @@ namespace sparse { namespace convert { template -__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, - value_idx *coo_rows, value_idx nnz) { +__global__ void csr_to_coo_kernel(const value_idx* row_ind, + value_idx m, + value_idx* coo_rows, + value_idx nnz) +{ // row-based matrix 1 thread per row value_idx row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { value_idx start_idx = row_ind[row]; - value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); - for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row; + value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); + for (value_idx i = start_idx; i < stop_idx; i++) + coo_rows[i] = row; } } @@ -57,14 +61,14 @@ __global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, * @param stream: cuda stream to use */ template -void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows, - value_idx nnz, cudaStream_t stream) { +void csr_to_coo( + const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream) +{ // @TODO: Use cusparse for this. dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_to_coo_kernel - <<>>(row_ind, m, coo_rows, nnz); + csr_to_coo_kernel<<>>(row_ind, m, coo_rows, nnz); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index a034bdbda8..2191f5edd1 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -44,29 +44,33 @@ namespace sparse { namespace convert { template -void coo_to_csr(const raft::handle_t &handle, const int *srcRows, - const int *srcCols, const value_t *srcVals, int nnz, int m, - int *dst_offsets, int *dstCols, value_t *dstVals) { - auto stream = handle.get_stream(); +void coo_to_csr(const raft::handle_t& handle, + const int* srcRows, + const int* srcCols, + const value_t* srcVals, + int nnz, + int m, + int* dst_offsets, + int* dstCols, + value_t* dstVals) +{ + auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); + auto d_alloc = handle.get_device_allocator(); raft::mr::device::buffer dstRows(d_alloc, stream, nnz); - CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); raft::mr::device::buffer pBuffer(d_alloc, stream, buffSize); raft::mr::device::buffer P(d_alloc, stream, nnz); - CUSPARSE_CHECK( - cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); - raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), - dstCols, P.data(), pBuffer.data(), stream); - raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), - stream); - raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, - dst_offsets, stream); + CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + raft::sparse::cusparsecoosortByRow( + cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream); + raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream); + raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -85,14 +89,20 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, * @param stream cuda stream to use * @param fused_op: the fused operation */ -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ op::csr_row_op( - row_ind, batchSize, nnz, + row_ind, + batchSize, + nnz, [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__( Index_ row, Index_ start_idx, Index_ stop_idx) { fused_op(row, start_idx, stop_idx); @@ -108,14 +118,23 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, stream); } -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream) { - csr_adj_graph_batched( - row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, - [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream) +{ + csr_adj_graph_batched(row_ind, + total_rows, + nnz, + batchSize, + adj, + row_ind_ptr, + stream, + [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); } /** @@ -131,13 +150,17 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream cuda stream to use * @param fused_op the fused operation */ -template void> -void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { - csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, - adj, row_ind_ptr, stream, fused_op); +template void> +void csr_adj_graph(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ + csr_adj_graph_batched( + row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op); } /** @@ -151,9 +174,13 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, +void sorted_coo_to_csr(const T* rows, + int nnz, + T* row_ind, + int m, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer row_counts(d_alloc, stream, m); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -161,11 +188,9 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d); } /** @@ -177,11 +202,12 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, +void sorted_coo_to_csr(COO* coo, + int* row_ind, std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); + cudaStream_t stream) +{ + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh index 299f9d36d4..e90882b501 100644 --- a/cpp/include/raft/sparse/convert/dense.cuh +++ b/cpp/include/raft/sparse/convert/dense.cuh @@ -37,22 +37,20 @@ namespace sparse { namespace convert { template -__global__ void csr_to_dense_warp_per_row_kernel(int n_cols, - const value_t *csrVal, - const int *csrRowPtr, - const int *csrColInd, - value_t *a) { +__global__ void csr_to_dense_warp_per_row_kernel( + int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a) +{ int row = blockIdx.x; int tid = threadIdx.x; int colStart = csrRowPtr[row]; - int colEnd = csrRowPtr[row + 1]; - int rowNnz = colEnd - colStart; + int colEnd = csrRowPtr[row + 1]; + int rowNnz = colEnd - colStart; for (int i = tid; i < rowNnz; i += blockDim.x) { int colIdx = colStart + i; if (colIdx < colEnd) { - int col = csrColInd[colIdx]; + int col = csrColInd[colIdx]; a[row * n_cols + col] = csrVal[colIdx]; } } @@ -77,10 +75,17 @@ __global__ void csr_to_dense_warp_per_row_kernel(int n_cols, * @param[in] row_major : Is row-major output desired? */ template -void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data, value_idx lda, value_t *out, - cudaStream_t stream, bool row_major = true) { +void csr_to_dense(cusparseHandle_t handle, + value_idx nrows, + value_idx ncols, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx lda, + value_t* out, + cudaStream_t stream, + bool row_major = true) +{ if (!row_major) { /** * If we need col-major, use cusparse. @@ -91,15 +96,13 @@ void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( - handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, - lda, stream)); + handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream)); CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); } else { int blockdim = block_dim(ncols); - CUDA_CHECK( - cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); csr_to_dense_warp_per_row_kernel<<>>( ncols, csr_data, csr_indptr, csr_indices, out); } diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 73120fea8c..348ed5eab2 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -68,83 +68,87 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers - * @param stream: CUDA stream to use - */ + * @param d_alloc: the device allocator to use for the underlying buffers + * @param stream: CUDA stream to use + */ COO(std::shared_ptr d_alloc, cudaStream_t stream) : rows_arr(d_alloc, stream, 0), cols_arr(d_alloc, stream, 0), vals_arr(d_alloc, stream, 0), nnz(0), n_rows(0), - n_cols(0) {} + n_cols(0) + { + } /** - * @param rows: coo rows array - * @param cols: coo cols array - * @param vals: coo vals array - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - */ - COO(raft::mr::device::buffer &rows, - raft::mr::device::buffer &cols, - raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, + * @param rows: coo rows array + * @param cols: coo cols array + * @param vals: coo vals array + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + */ + COO(raft::mr::device::buffer& rows, + raft::mr::device::buffer& cols, + raft::mr::device::buffer& vals, + Index_Type nnz, + Index_Type n_rows = 0, Index_Type n_cols = 0) - : rows_arr(rows), - cols_arr(cols), - vals_arr(vals), - nnz(nnz), - n_rows(n_rows), - n_cols(n_cols) {} + : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols) + { + } /** - * @param d_alloc: the device allocator use - * @param stream: CUDA stream to use - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param init: initialize arrays with zeros - */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, - bool init = true) + * @param d_alloc: the device allocator use + * @param stream: CUDA stream to use + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + * @param init: initialize arrays with zeros + */ + COO(std::shared_ptr d_alloc, + cudaStream_t stream, + Index_Type nnz, + Index_Type n_rows = 0, + Index_Type n_cols = 0, + bool init = true) : rows_arr(d_alloc, stream, nnz), cols_arr(d_alloc, stream, nnz), vals_arr(d_alloc, stream, nnz), nnz(nnz), n_rows(n_rows), - n_cols(n_cols) { + n_cols(n_cols) + { if (init) init_arrays(stream); } - void init_arrays(cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK( - cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); + void init_arrays(cudaStream_t stream) + { + CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); } ~COO() {} /** - * @brief Size should be > 0, with the number of rows - * and cols in the dense matrix being > 0. - */ - bool validate_size() const { + * @brief Size should be > 0, with the number of rows + * and cols in the dense matrix being > 0. + */ + bool validate_size() const + { if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false; return true; } /** - * @brief If the underlying arrays have not been set, - * return false. Otherwise true. - */ - bool validate_mem() const { - if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || - this->vals_arr.size() == 0) { + * @brief If the underlying arrays have not been set, + * return false. Otherwise true. + */ + bool validate_mem() const + { + if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) { return false; } @@ -154,33 +158,30 @@ class COO { /* * @brief Returns the rows array */ - Index_Type *rows() { return this->rows_arr.data(); } + Index_Type* rows() { return this->rows_arr.data(); } /** * @brief Returns the cols array */ - Index_Type *cols() { return this->cols_arr.data(); } + Index_Type* cols() { return this->cols_arr.data(); } /** * @brief Returns the vals array */ - T *vals() { return this->vals_arr.data(); } + T* vals() { return this->vals_arr.data(); } /** - * @brief Send human-readable state information to output stream - */ - friend std::ostream &operator<<(std::ostream &out, - const COO &c) { + * @brief Send human-readable state information to output stream + */ + friend std::ostream& operator<<(std::ostream& out, const COO& c) + { if (c.validate_size() && c.validate_mem()) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) - << std::endl; - out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) - << std::endl; - out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) - << std::endl; + out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl; + out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl; + out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl; out << "nnz=" << c.nnz << std::endl; out << "n_rows=" << c.n_rows << std::endl; out << "n_cols=" << c.n_cols << std::endl; @@ -194,58 +195,59 @@ class COO { } /** - * @brief Set the number of rows and cols - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - */ - void setSize(int n_rows, int n_cols) { + * @brief Set the number of rows and cols + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + */ + void setSize(int n_rows, int n_cols) + { this->n_rows = n_rows; this->n_cols = n_cols; } /** - * @brief Set the number of rows and cols for a square dense matrix - * @param n: number of rows and cols - */ - void setSize(int n) { + * @brief Set the number of rows and cols for a square dense matrix + * @param n: number of rows and cols + */ + void setSize(int n) + { this->n_rows = n; this->n_cols = n; } /** - * @brief Allocate the underlying arrays - * @param nnz: size of underlying row/col/val arrays - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, bool init, cudaStream_t stream) { - this->allocate(nnz, 0, init, stream); - } + * @brief Allocate the underlying arrays + * @param nnz: size of underlying row/col/val arrays + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param size: the number of rows/cols in a square dense matrix - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, int size, bool init, cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param size: the number of rows/cols in a square dense matrix + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, int size, bool init, cudaStream_t stream) + { this->allocate(nnz, size, size, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - * @param init: should values be initialized to 0? - * @param stream: stream to use for init - */ - void allocate(int nnz, int n_rows, int n_cols, bool init, - cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + * @param init: should values be initialized to 0? + * @param stream: stream to use for init + */ + void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream) + { this->n_rows = n_rows; this->n_cols = n_cols; - this->nnz = nnz; + this->nnz = nnz; this->rows_arr.resize(this->nnz, stream); this->cols_arr.resize(this->nnz, stream); diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index bc4a68d296..17f3c735af 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -41,57 +41,64 @@ namespace sparse { struct WeakCCState { public: - bool *m; - WeakCCState(bool *m) : m(m) {} + bool* m; + WeakCCState(bool* m) : m(m) {} }; template -__global__ void weak_cc_label_device(Index_ *__restrict__ labels, - const Index_ *__restrict__ row_ind, - const Index_ *__restrict__ row_ind_ptr, - Index_ nnz, bool *__restrict__ m, - Index_ start_vertex_id, Index_ batch_size, - Index_ N, Lambda filter_op) { - Index_ tid = threadIdx.x + blockIdx.x * TPB_X; +__global__ void weak_cc_label_device(Index_* __restrict__ labels, + const Index_* __restrict__ row_ind, + const Index_* __restrict__ row_ind_ptr, + Index_ nnz, + bool* __restrict__ m, + Index_ start_vertex_id, + Index_ batch_size, + Index_ N, + Lambda filter_op) +{ + Index_ tid = threadIdx.x + blockIdx.x * TPB_X; Index_ global_id = tid + start_vertex_id; if (tid < batch_size && global_id < N) { Index_ start = __ldg(row_ind + tid); Index_ ci, cj; - bool ci_mod = false; - ci = labels[global_id]; + bool ci_mod = false; + ci = labels[global_id]; bool ci_allow_prop = filter_op(global_id); Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind); /// TODO: add one element to row_ind and avoid get_stop_idx for (Index_ j = start; j < end; j++) { - Index_ j_ind = __ldg(row_ind_ptr + j); - cj = labels[j_ind]; + Index_ j_ind = __ldg(row_ind_ptr + j); + cj = labels[j_ind]; bool cj_allow_prop = filter_op(j_ind); if (ci < cj && ci_allow_prop) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + j_ind), ci); + atomicMin((int*)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + j_ind), ci); + atomicMin((long long int*)(labels + j_ind), ci); if (cj_allow_prop) *m = true; } else if (ci > cj && cj_allow_prop) { - ci = cj; + ci = cj; ci_mod = true; } } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + global_id), ci); + atomicMin((int*)(labels + global_id), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + global_id), ci); + atomicMin((long long int*)(labels + global_id), ci); if (ci_allow_prop) *m = true; } } } template -__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, - Index_ MAX_LABEL, Lambda filter_op) { +__global__ void weak_cc_init_all_kernel(Index_* labels, + Index_ N, + Index_ MAX_LABEL, + Lambda filter_op) +{ Index_ tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (filter_op(tid)) @@ -123,22 +130,25 @@ __global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream, - Lambda filter_op) { - ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, - "Index_ should be 4 or 8 bytes"); +template bool> +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream, + Lambda filter_op) +{ + ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes"); bool host_m; Index_ MAX_LABEL = std::numeric_limits::max(); weak_cc_init_all_kernel - <<>>( - labels, N, MAX_LABEL, filter_op); + <<>>(labels, N, MAX_LABEL, filter_op); CUDA_CHECK(cudaPeekAtLastError()); int n_iters = 0; @@ -147,8 +157,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, weak_cc_label_device <<>>( - labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, - batch_size, N, filter_op); + labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op); CUDA_CHECK(cudaPeekAtLastError()); //** Updating m * @@ -180,12 +189,25 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param stream the cuda stream to use */ template -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream) { - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id, - batch_size, state, stream, +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream) +{ + weak_cc_batched(labels, + row_ind, + row_ind_ptr, + nnz, + N, + start_vertex_id, + batch_size, + state, + stream, [] __device__(Index_ tid) { return true; }); } @@ -213,17 +235,20 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, +template bool> +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, std::shared_ptr d_alloc, - cudaStream_t stream, Lambda filter_op) { + cudaStream_t stream, + Lambda filter_op) +{ raft::mr::device::buffer m(d_alloc, stream, 1); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, filter_op); + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); } /** @@ -249,14 +274,18 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param stream the cuda stream to use */ template -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer m(d_alloc, stream, 1); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, [](Index_) { return true; }); + weak_cc_batched( + labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); } }; // namespace sparse diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 360832f557..9d42ec34cb 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -23,10 +23,9 @@ //#include #define _CUSPARSE_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; -//Notes: +// Notes: //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; //(2.) to enforce a lower version, // @@ -43,16 +42,15 @@ namespace raft { * @brief Exception thrown when a cuSparse error is encountered. */ struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) - : raft::exception(message) {} - explicit cusparse_error(std::string const& message) - : raft::exception(message) {} + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} }; namespace sparse { namespace detail { -inline const char* cusparse_error_to_string(cusparseStatus_t err) { +inline const char* cusparse_error_to_string(cusparseStatus_t err) +{ #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(err); #else // CUDART_VERSION @@ -65,8 +63,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: - return "CUSPARSE_STATUS_UNKNOWN"; + default: return "CUSPARSE_STATUS_UNKNOWN"; }; #endif // CUDART_VERSION } @@ -88,8 +85,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { cusparseStatus_t const status = (call); \ if (CUSPARSE_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::sparse::detail::cusparse_error_to_string(status)); \ throw raft::cusparse_error(msg); \ } \ @@ -100,13 +100,15 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { //@todo: use logger here once logging is enabled /** check for cusparse runtime API errors but do not assert */ -#define CUSPARSE_CHECK_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ +#define CUSPARSE_CHECK_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ } while (0) namespace raft { @@ -117,28 +119,34 @@ namespace sparse { * @{ */ template -cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals, - T* vals_sorted, int* d_P, cudaStream_t stream); +cusparseStatus_t cusparsegthr( + cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const double* vals, double* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const double* vals, + double* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const float* vals, float* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const float* vals, + float* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } /** @} */ @@ -148,15 +156,18 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, * @{ */ template -void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz, - int m, T* csrRowPtr, cudaStream_t stream); +void cusparsecoo2csr( + cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream); template <> -inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, - int nnz, int m, int* csrRowPtr, - cudaStream_t stream) { +inline void cusparsecoo2csr(cusparseHandle_t handle, + const int* cooRowInd, + int nnz, + int m, + int* csrRowPtr, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -166,30 +177,54 @@ inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, */ template size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows, - const T* cooCols, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* cooRows, + const T* cooCols, + cudaStream_t stream); template <> inline size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows, - const int* cooCols, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cooRows, + const int* cooCols, + cudaStream_t stream) +{ size_t val; CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); + CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); return val; } template void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P, - void* pBuffer, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + T* cooRows, + T* cooCols, + T* P, + void* pBuffer, + cudaStream_t stream); template <> inline void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols, - int* P, void* pBuffer, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRows, + int* cooCols, + int* P, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); + CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); } /** @} */ @@ -199,37 +234,67 @@ inline void cusparsecoosortByRow( // NOLINT */ template cusparseStatus_t cusparsegemmi( // NOLINT - cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, - const T* A, int lda, const T* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const T* alpha, + const T* A, + int lda, + const T* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const float* alpha, - const float* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float* alpha, + const float* A, + int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, - float* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseSgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const double* alpha, - const double* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double* alpha, + const double* A, + int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, - double* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseDgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } /** @} */ @@ -241,49 +306,94 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, */ template cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, int64_t nnz, - IndexT* csrRowOffsets, IndexT* csrColInd, + int64_t rows, + int64_t cols, + int64_t nnz, + IndexT* csrRowOffsets, + IndexT* csrColInd, ValueT* csrValues); template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } /** @} */ @@ -292,16 +402,19 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, * @{ */ template -cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, T* values); +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values); template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, float* values) { + int64_t size, + float* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, double* values) { + int64_t size, + double* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); } /** @} */ @@ -312,23 +425,30 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, */ template cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, int64_t ld, - T* values, cusparseOrder_t order); + int64_t rows, + int64_t cols, + int64_t ld, + T* values, + cusparseOrder_t order); template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, float* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + float* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order); } template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, double* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + double* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order); } /** @} */ @@ -337,58 +457,89 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, * @{ */ template -cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_32F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_64F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize); } template -cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, const T* beta, +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, T* externalBuffer, + cusparseSpMVAlg_t alg, + T* externalBuffer, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - double* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -398,29 +549,59 @@ inline cusparseStatus_t cusparsespmv( */ template cusparseStatus_t cusparsecsrmv( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const T* beta, + T* y, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, - float* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const float* beta, + float* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseScsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, const double* x, - const double* beta, double* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const double* beta, + double* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseDcsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } /** @} */ #endif @@ -431,58 +612,96 @@ inline cusparseStatus_t cusparsecsrmv( * @{ */ template -cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_32F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_64F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize); } template -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + T* externalBuffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_32F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_64F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -492,31 +711,68 @@ inline cusparseStatus_t cusparsespmm( */ template cusparseStatus_t cusparsecsrmm( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, - const T* beta, T* y, const int ldy, cudaStream_t stream); + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const int ldx, + const T* beta, + T* y, + const int ldy, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const float* alpha, const cusparseMatDescr_t descr, - const float* csrVal, const int* csrRowPtr, const int* csrColInd, - const float* x, const int ldx, const float* beta, float* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const int ldx, + const float* beta, + float* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseScsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const double* alpha, const cusparseMatDescr_t descr, - const double* csrVal, const int* csrRowPtr, const int* csrColInd, - const double* x, const int ldx, const double* beta, double* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const int ldx, + const double* beta, + double* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseDcsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ #endif @@ -527,15 +783,22 @@ inline cusparseStatus_t cusparsecsrmm( */ template void cusparsecsr2coo( // NOLINT - cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, - T* cooRowInd, cudaStream_t stream); + cusparseHandle_t handle, + const int n, + const int nnz, + const T* csrRowPtr, + T* cooRowInd, + cudaStream_t stream); template <> -inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, - const int* csrRowPtr, int* cooRowInd, - cudaStream_t stream) { +inline void cusparsecsr2coo(cusparseHandle_t handle, + const int n, + const int nnz, + const int* csrRowPtr, + int* cooRowInd, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -553,7 +816,8 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, // template<> inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, cusparsePointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSetPointerMode(handle, mode); } @@ -564,69 +828,203 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, * @{ */ template -cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA, - CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y, - CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes); -} -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA, - CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y, - CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes); +cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + size_t* bufferSizeInBytes, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + bufferSizeInBytes); +} +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + bufferSizeInBytes); } template -cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, T* buffer, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, float* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, - descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x, - CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F, - CUDA_R_32F, buffer); -} -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, double* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, - descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x, - CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F, - CUDA_R_64F, buffer); +cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + T* buffer, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + float* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + buffer); +} +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + double* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + buffer); } /** @} */ @@ -637,68 +1035,180 @@ inline cusparseStatus_t cusparsecsrmvex( */ template -cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + bufferSize); } template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + bufferSize); } template -cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_32F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + buffer); } template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_64F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + buffer); } /** @} */ @@ -709,120 +1219,329 @@ inline cusparseStatus_t cusparsecsr2csc( */ template -cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta, - const cusparseMatDescr_t matA, int nnzA, const int* rowindA, - const int* indicesA, const cusparseMatDescr_t matB, int nnzB, - const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD, - int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info, - size_t* pBufferSizeInBytes, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const float* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const T* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseScsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const double* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const double* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseDcsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } -inline cusparseStatus_t cusparsecsrgemm2nnz( - cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA, - int nnzA, const int* rowindA, const int* indicesA, - const cusparseMatDescr_t matB, int nnzB, const int* rowindB, - const int* indicesB, const cusparseMatDescr_t matD, int nnzD, - const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC, - int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle, + int m, + int n, + int k, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + const cusparseMatDescr_t matC, + int* rowindC, + int* nnzC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA, - matB, nnzB, rowindB, indicesB, matD, nnzD, - rowindD, indicesD, matC, rowindC, nnzC, info, + return cusparseXcsrgemm2Nnz(handle, + m, + n, + k, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + matD, + nnzD, + rowindD, + indicesD, + matC, + rowindC, + nnzC, + info, pBuffer); #pragma GCC diagnostic pop } template -cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, - const cusparseMatDescr_t descrA, int nnzA, const T* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD, - const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC, - T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info, - void* pBuffer, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const cusparseMatDescr_t descrA, int nnzA, const float* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const float* beta, const cusparseMatDescr_t descrD, int nnzD, - const float* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const T* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const T* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const T* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + T* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const float* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + float* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseScsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const cusparseMatDescr_t descrA, int nnzA, const double* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const double* beta, const cusparseMatDescr_t descrD, int nnzD, - const double* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const double* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + double* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseDcsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } @@ -834,33 +1553,46 @@ inline cusparseStatus_t cusparsecsrgemm2( */ template -cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, - const int* csrColIndA, T* A, int lda, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + T* A, + int lda, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const float* csrValA, const int* csrRowPtrA, - const int* csrColIndA, float* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + float* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const double* csrValA, const int* csrRowPtrA, - const int* csrColIndA, double* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + double* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } /** @} */ diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index f3109556b7..aef19122da 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -37,9 +37,11 @@ namespace distance { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_binary_row_norm_kernel( - value_t *out, const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, value_idx nnz) { +__global__ void compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; if (i < nnz) { // We do conditional here only because it's @@ -51,55 +53,64 @@ __global__ void compute_binary_row_norm_kernel( } template -__global__ void compute_binary_warp_kernel(value_t *__restrict__ C, - const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t q_norm = Q_norms[i]; - value_t r_norm = R_norms[j]; - value_t dot = C[(size_t)i * n_cols + j]; + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); } -template -void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func, cudaStream_t stream) { +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_binary_warp_kernel<<>>( C, Q_norms, R_norms, n_rows, n_cols, expansion_func); } -template -void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_binary_row_norm_kernel<<>>( Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_binary_row_norm_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, - stream); + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); } /** @@ -109,44 +120,52 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, template class jaccard_expanded_distances_t : public distances_t { public: - explicit jaccard_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t denom = q_r_union - dot; - - value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); - - // flip the similarity when both rows are 0 - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * jacc) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); } ~jaccard_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -158,40 +177,48 @@ class jaccard_expanded_distances_t : public distances_t { template class dice_expanded_distances_t : public distances_t { public: - explicit dice_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t dice = (2 * dot) / q_r_union; - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * dice) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); } ~dice_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 1c55412eec..29c823bcdb 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -24,31 +24,31 @@ namespace distance { template struct distances_config_t { - distances_config_t(const raft::handle_t &handle_) : handle(handle_) {} + distances_config_t(const raft::handle_t& handle_) : handle(handle_) {} // left side value_idx a_nrows; value_idx a_ncols; value_idx a_nnz; - value_idx *a_indptr; - value_idx *a_indices; - value_t *a_data; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; // right side value_idx b_nrows; value_idx b_ncols; value_idx b_nnz; - value_idx *b_indptr; - value_idx *b_indices; - value_t *b_data; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; - const raft::handle_t &handle; + const raft::handle_t& handle; }; template class distances_t { public: - virtual void compute(value_t *out) {} + virtual void compute(value_t* out) {} virtual ~distances_t() = default; }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index 3a78f9ada0..cdf1be0c68 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -41,19 +41,29 @@ namespace raft { namespace sparse { namespace distance { -template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); - - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, - chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); }; /** @@ -89,39 +99,55 @@ inline void balanced_coo_pairwise_generalized_spmv( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); int max_cols = max_cols_per_block(); if (max_cols > config_.a_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } }; -template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); }; /** @@ -160,24 +186,30 @@ inline void balanced_coo_pairwise_generalized_spmv_rev( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ // try dense first int max_cols = max_cols_per_block(); if (max_cols > config_.b_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 5ace978a23..7a83e73183 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -32,58 +32,114 @@ namespace distance { template class coo_spmv_strategy { public: - coo_spmv_strategy(const distances_config_t &config_) - : config(config_) { + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { smem = raft::getSharedMemPerBlock(); } - template - void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr, - value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size, int n_blocks, - int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz, - coo_rows_b, config.b_indices, config.b_data, config.a_nrows, - config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row, - chunk_size, config.b_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); } - template - void _dispatch_base_rev(strategy_t &strategy, int smem_dim, - indptr_it &b_indptr, value_t *out_dists, - value_idx *coo_rows_a, product_f product_func, - accum_f accum_func, write_f write_func, - int chunk_size, int n_blocks, int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz, - coo_rows_a, config.a_indices, config.a_data, config.b_nrows, - config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row, - chunk_size, config.a_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); } protected: int smem; - const distances_config_t &config; + const distances_config_t& config; }; } // namespace distance diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 44c3833f96..6586067b56 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -29,11 +29,15 @@ namespace distance { template class mask_row_it { public: - mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_ = NULL) - : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {} + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { if (mask_row_idx != NULL) { return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; } else { @@ -41,37 +45,49 @@ class mask_row_it { } } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { start_offset = full_indptr[row_idx]; - stop_offset = full_indptr[row_idx + 1] - 1; + stop_offset = full_indptr[row_idx + 1] - 1; } - __device__ constexpr inline void get_indices_boundary( - const value_idx *indices, value_idx &indices_len, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { // do nothing; } - __device__ constexpr inline bool check_indices_bounds( - value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) { + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return true; } const value_idx *full_indptr, &n_rows; - value_idx *mask_row_idx; + value_idx* mask_row_idx; }; template -__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, - value_idx *chunk_indices, - value_idx n_rows) { +__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n_rows) { auto start = n_chunks_per_row[tid]; - auto end = n_chunks_per_row[tid + 1]; + auto end = n_chunks_per_row[tid + 1]; #pragma unroll for (int i = start; i < end; i++) { @@ -83,73 +99,89 @@ __global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, template class chunked_mask_row_it : public mask_row_it { public: - chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_, int row_chunk_size_, - const value_idx *n_chunks_per_row_, - const value_idx *chunk_indices_, + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, const cudaStream_t stream_) : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), row_chunk_size(row_chunk_size_), n_chunks_per_row(n_chunks_per_row_), chunk_indices(chunk_indices_), - stream(stream_) {} + stream(stream_) + { + } - static void init(const value_idx *indptr, const value_idx *mask_row_idx, - const value_idx &n_rows, const int row_chunk_size, - rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, - cudaStream_t stream) { + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto policy = rmm::exec_policy(stream); constexpr value_idx first_element = 0; n_chunks_per_row.set_element_async(0, first_element, stream); n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); - thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows, - n_chunks_per_row.begin() + 1, chunk_functor); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); - thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1, - n_chunks_per_row.end(), - n_chunks_per_row.begin() + 1); + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); - raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, - stream); + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { - auto chunk_index = blockIdx.x / n_blocks_nnz_b; - auto chunk_val = chunk_indices[chunk_index]; - auto prev_n_chunks = n_chunks_per_row[chunk_val]; + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; auto relative_chunk = chunk_index - prev_n_chunks; - first_a_chunk = relative_chunk == 0; + first_a_chunk = relative_chunk == 0; start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; - stop_offset = start_offset + row_chunk_size; + stop_offset = start_offset + row_chunk_size; auto final_stop_offset = this->full_indptr[row_idx + 1]; last_a_chunk = stop_offset >= final_stop_offset; - stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; } - __device__ inline void get_indices_boundary( - const value_idx *indices, value_idx &row_idx, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; - stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; } - __device__ inline bool check_indices_bounds(value_idx &start_index_a, - value_idx &stop_index_a, - value_idx &index_b) { + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return (index_b >= start_index_a && index_b <= stop_index_a); } @@ -160,30 +192,34 @@ class chunked_mask_row_it : public mask_row_it { struct n_chunks_per_row_functor { public: - n_chunks_per_row_functor(const value_idx *indptr_, - value_idx row_chunk_size_) - : indptr(indptr_), row_chunk_size(row_chunk_size_) {} + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } - __host__ __device__ value_idx operator()(const value_idx &i) { + __host__ __device__ value_idx operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return raft::ceildiv(degree, (value_idx)row_chunk_size); } - const value_idx *indptr; + const value_idx* indptr; value_idx row_chunk_size; }; private: - static void fill_chunk_indices( - const value_idx &n_rows, rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, cudaStream_t stream) { + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto n_threads = std::min(n_rows, 256); - auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); chunk_indices.resize(total_row_blocks, stream); - fill_chunk_indices_kernel<<>>( - n_chunks_per_row.data(), chunk_indices.data(), n_rows); + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); } }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh index c463654a3b..aac98d6b02 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh @@ -25,71 +25,91 @@ namespace distance { template class dense_smem_strategy : public coo_spmv_strategy { public: - using smem_type = value_t *; + using smem_type = value_t*; using insert_type = smem_type; - using find_type = smem_type; + using find_type = smem_type; - dense_smem_strategy(const distances_config_t &config_) - : coo_spmv_strategy(config_) {} + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } - inline static int smem_per_block(int n_cols) { - return (n_cols * sizeof(value_t)) + - ((1024 / raft::warp_size()) * sizeof(value_t)); + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.b_nnz, chunk_size * 1024); - auto n_blocks = this->config.a_nrows * n_blocks_per_row; - - mask_row_it a_indptr(this->config.a_indptr, - this->config.a_nrows); - - this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists, - coo_rows_b, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.a_nnz, chunk_size * 1024); - auto n_blocks = this->config.b_nrows * n_blocks_per_row; - - mask_row_it b_indptr(this->config.b_indptr, - this->config.b_nrows); - - this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists, - coo_rows_a, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { cache[k] = 0.0; } return cache; } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { cache[key] = value; } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return cache; } - __device__ inline value_t find(find_type cache, const value_idx &key) { - return cache[key]; - } + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } }; } // namespace distance diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 1295d24103..3f8f4b21ad 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -1,18 +1,18 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once @@ -38,177 +38,238 @@ template class hash_strategy : public coo_spmv_strategy { public: using insert_type = - typename cuco::static_map::device_mutable_view; - using smem_type = typename insert_type::slot_type *; + typename cuco::static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; using find_type = - typename cuco::static_map::device_view; + typename cuco::static_map::device_view; - hash_strategy(const distances_config_t &config_, - float capacity_threshold_ = 0.5, int map_size_ = get_map_size()) + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) : coo_spmv_strategy(config_), capacity_threshold(capacity_threshold_), - map_size(map_size_) {} + map_size(map_size_) + { + } - void chunking_needed(const value_idx *indptr, const value_idx n_rows, - rmm::device_uvector &mask_indptr, - std::tuple &n_rows_divided, - cudaStream_t stream) { + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { auto policy = rmm::exec_policy(stream); - auto less = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), mask_indptr.data(), - fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); std::get<0>(n_rows_divided) = less - mask_indptr.data(); auto more = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), less, - fits_in_hash_table(indptr, capacity_threshold * map_size, - std::numeric_limits::max())); + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); std::get<1>(n_rows_divided) = more - less; } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.a_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.a_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.a_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.a_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.a_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.b_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.b_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.b_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.b_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.b_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { return insert_type::make_from_uninitialized_slots( cooperative_groups::this_thread_block(), cache, cache_size, -1, 0); } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { auto success = cache.insert(cuco::pair(key, value)); } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return find_type(cache, cache_size, -1, 0); } - __device__ inline value_t find(find_type cache, const value_idx &key) { + __device__ inline value_t find(find_type cache, const value_idx& key) + { auto a_pair = cache.find(key); value_t a_col = 0.0; - if (a_pair != cache.end()) { - a_col = a_pair->second; - } + if (a_pair != cache.end()) { a_col = a_pair->second; } return a_col; } struct fits_in_hash_table { public: - fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_, - value_idx degree_r_) - : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {} + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } - __host__ __device__ bool operator()(const value_idx &i) { + __host__ __device__ bool operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return degree >= degree_l && degree < degree_r; } private: - const value_idx *indptr; + const value_idx* indptr; const value_idx degree_l, degree_r; }; - inline static int get_map_size() { - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(typename insert_type::slot_type); } diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh index 51f9a05394..b12252ab25 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh @@ -27,68 +27,88 @@ namespace sparse { namespace distance { /** - * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with - * sparse-matrix-sparse-vector multiplication layout (SPMV). - * This is intended to be scheduled n_chunks_b times for each row of a. - * The steps are as follows: - * - * 1. Load row from A into dense vector in shared memory. - * This can be further chunked in the future if necessary to support larger - * column sizes. - * 2. Threads of block all step through chunks of B in parallel. - * When a new row is encountered in row_indices_b, a segmented - * reduction is performed across the warps and then across the - * block and the final value written out to host memory. - * - * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf - * - * @tparam value_idx index type - * @tparam value_t value type - * @tparam tpb threads per block configured on launch - * @tparam rev if this is true, the reduce/accumulate functions are only - * executed when A[col] == 0.0. when executed before/after !rev - * and A & B are reversed, this allows the full symmetric difference - * and intersection to be computed. - * @tparam kv_t data type stored in shared mem cache - * @tparam product_f reduce function type (semiring product() function). - * accepts two arguments of value_t and returns a value_t - * @tparam accum_f accumulation function type (semiring sum() function). - * accepts two arguments of value_t and returns a value_t - * @tparam write_f function to write value out. this should be mathematically - * equivalent to the accumulate function but implemented as - * an atomic operation on global memory. Accepts two arguments - * of value_t* and value_t and updates the value given by the - * pointer. - * @param[in] indptrA column pointer array for A - * @param[in] indicesA column indices array for A - * @param[in] dataA data array for A - * @param[in] rowsB coo row array for B - * @param[in] indicesB column indices array for B - * @param[in] dataB data array for B - * @param[in] m number of rows in A - * @param[in] n number of rows in B - * @param[in] dim number of features - * @param[in] nnz_b number of nonzeros in B - * @param[out] out array of size m*n - * @param[in] n_blocks_per_row number of blocks of B per row of A - * @param[in] chunk_size number of nnz for B to use for each row of A - * @param[in] buffer_size amount of smem to use for each row of A - * @param[in] product_func semiring product() function - * @param[in] accum_func semiring sum() function - * @param[in] write_func atomic semiring sum() function - */ -template -__global__ void balanced_coo_generalized_spmv_kernel( - strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA, - value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB, - value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out, - int n_blocks_per_row, int chunk_size, value_idx b_ncols, - product_f product_func, accum_f accum_func, write_f write_func) { + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. + * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ typedef cub::WarpReduce warp_reduce; - value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; // chunk starting offset @@ -96,18 +116,17 @@ __global__ void balanced_coo_generalized_spmv_kernel( // how many total cols will be processed by this block (should be <= chunk_size * n_threads) value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); - int tid = threadIdx.x; + int tid = threadIdx.x; int warp_id = tid / raft::warp_size(); // compute id relative to current warp unsigned int lane_id = tid & (raft::warp_size() - 1); - value_idx ind = ind_offset + threadIdx.x; + value_idx ind = ind_offset + threadIdx.x; extern __shared__ char smem[]; - typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); - typename warp_reduce::TempStorage *temp_storage = - (typename warp_reduce::TempStorage *)(A + dim); + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); auto inserter = strategy.init_insert(A, dim); @@ -115,13 +134,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( value_idx start_offset_a, stop_offset_a; bool first_a_chunk, last_a_chunk; - indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a, - n_blocks_per_row, first_a_chunk, last_a_chunk); + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); // Convert current row vector in A to dense for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) { - strategy.insert(inserter, indicesA[start_offset_a + i], - dataA[start_offset_a + i]); + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); } __syncthreads(); @@ -132,34 +150,36 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (ind >= nnz_b) return; value_idx start_index_a = 0, stop_index_a = b_ncols - 1; - indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a, - stop_offset_a, start_index_a, stop_index_a, - first_a_chunk, last_a_chunk); + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); value_idx cur_row_b = -1; - value_t c = 0.0; + value_t c = 0.0; auto warp_red = warp_reduce(*(temp_storage + warp_id)); if (tid < active_chunk_size) { cur_row_b = rowsB[ind]; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = product_func(a_col, dataB[ind]); - } + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } } } // loop through chunks in parallel, reducing when a new row is // encountered by each thread for (int i = tid; i < active_chunk_size; i += blockDim.x) { - value_idx ind_next = ind + blockDim.x; + value_idx ind_next = ind + blockDim.x; value_idx next_row_b = -1; if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; @@ -170,14 +190,13 @@ __global__ void balanced_coo_generalized_spmv_kernel( // grab the threads currently participating in loops. // because any other threads should have returned already. unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); - bool is_leader = get_lowest_peer(peer_group) == lane_id; - value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); // thread with lowest lane id among peers writes out if (is_leader && v != 0.0) { // this conditional should be uniform, since rev is constant - size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b - : (size_t)cur_row_b * m + cur_row_a; + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; write_func(out + idx, v); } @@ -187,15 +206,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (next_row_b != -1) { ind = ind_next; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = accum_func(c, product_func(a_col, dataB[ind])); - } + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } } cur_row_b = next_row_b; diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index a1974b3666..228a62ed7a 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -74,16 +74,17 @@ static const std::unordered_set supportedDistance{ * @param[in] metric distance metric to use */ template -void pairwiseDistance(value_t *out, +void pairwiseDistance(value_t* out, distances_config_t input_config, - raft::distance::DistanceType metric, float metric_arg) { + raft::distance::DistanceType metric, + float metric_arg) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: l2_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtExpanded: - l2_sqrt_expanded_distances_t(input_config) - .compute(out); + l2_sqrt_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::InnerProduct: ip_distances_t(input_config).compute(out); @@ -92,62 +93,49 @@ void pairwiseDistance(value_t *out, l2_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - l2_sqrt_unexpanded_distances_t(input_config) - .compute(out); + l2_sqrt_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L1: l1_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::LpUnexpanded: - lp_unexpanded_distances_t(input_config, metric_arg) - .compute(out); + lp_unexpanded_distances_t(input_config, metric_arg).compute(out); break; case raft::distance::DistanceType::Linf: - linf_unexpanded_distances_t(input_config) - .compute(out); + linf_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::Canberra: - canberra_unexpanded_distances_t(input_config) - .compute(out); + canberra_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JaccardExpanded: - jaccard_expanded_distances_t(input_config) - .compute(out); + jaccard_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CosineExpanded: - cosine_expanded_distances_t(input_config) - .compute(out); + cosine_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HellingerExpanded: - hellinger_expanded_distances_t(input_config) - .compute(out); + hellinger_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::DiceExpanded: dice_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CorrelationExpanded: - correlation_expanded_distances_t(input_config) - .compute(out); + correlation_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::RusselRaoExpanded: - russelrao_expanded_distances_t(input_config) - .compute(out); + russelrao_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HammingUnexpanded: - hamming_unexpanded_distances_t(input_config) - .compute(out); + hamming_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JensenShannon: - jensen_shannon_unexpanded_distances_t(input_config) - .compute(out); + jensen_shannon_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::KLDivergence: - kl_divergence_unexpanded_distances_t(input_config) - .compute(out); + kl_divergence_unexpanded_distances_t(input_config).compute(out); break; - default: - THROW("Unsupported distance: %d", metric); + default: THROW("Unsupported distance: %d", metric); } } diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 882ccba027..8d77f9f5b5 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -45,10 +45,13 @@ class ip_distances_t : public distances_t { * Computes simple sparse inner product distances as sum(x_y * y_k) * @param[in] config specifies inputs, outputs, and sizes */ - ip_distances_t(const distances_config_t &config) - : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) { - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows_b.data(), config_->b_nnz, + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, config_->handle.get_stream()); } @@ -56,21 +59,21 @@ class ip_distances_t : public distances_t { * Performs pairwise distance computation and computes output distances * @param out_distances dense output matrix (size a_nrows * b_nrows) */ - void compute(value_t *out_distances) { + void compute(value_t* out_distances) + { /** - * Compute pairwise distances and return dense matrix in row-major format - */ + * Compute pairwise distances and return dense matrix in row-major format + */ balanced_coo_pairwise_generalized_spmv( - out_distances, *config_, coo_rows_b.data(), Product(), Sum(), - AtomicAdd()); + out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd()); } - value_idx *b_rows_coo() { return coo_rows_b.data(); } + value_idx* b_rows_coo() { return coo_rows_b.data(); } - value_t *b_data_coo() { return config_->b_data; } + value_t* b_data_coo() { return config_->b_data; } private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector coo_rows_b; }; }; // END namespace distance diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 8886d4c9df..a9a2d1ee91 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -41,35 +41,36 @@ namespace distance { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_row_norm_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i] * data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } } template -__global__ void compute_row_sum_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } } template -__global__ void compute_euclidean_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; @@ -83,25 +84,29 @@ __global__ void compute_euclidean_warp_kernel( } template -__global__ void compute_correlation_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols, - value_idx n) { +__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t dot = C[(size_t)i * n_cols + j]; + value_t dot = C[(size_t)i * n_cols + j]; value_t Q_l1 = Q_norms[i]; value_t R_l1 = R_norms[j]; value_t Q_l2 = Q_sq_norms[i]; value_t R_l2 = R_sq_norms[j]; - value_t numer = n * dot - (Q_l1 * R_l1); + value_t numer = n * dot - (Q_l1 * R_l1); value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); value_t R_denom = n * R_l2 - (R_l1 * R_l1); @@ -111,58 +116,77 @@ __global__ void compute_correlation_warp_kernel( C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); } -template -void compute_euclidean(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, value_idx n_rows, - value_idx n_cols, cudaStream_t stream, - expansion_f expansion_func) { +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_euclidean_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); } -template -void compute_l2(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_row_norm_kernel<<>>( R_sq_norms.data(), R_coo_rows, R_data, R_nnz); - compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, - expansion_func); + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); } template -void compute_correlation(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, const value_t *Q_norms, - const value_t *R_norms, value_idx n_rows, - value_idx n_cols, value_idx n, cudaStream_t stream) { +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_correlation_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); } template -void compute_corr(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, std::shared_ptr alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); @@ -171,15 +195,11 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); @@ -191,8 +211,15 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, compute_row_sum_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(), - R_norms.data(), m, n, n_cols, stream); + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); } /** @@ -202,35 +229,45 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, template class l2_expanded_distances_t : public distances_t { public: - explicit l2_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - return -2 * dot + q_norm + r_norm; - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); } ~l2_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; @@ -239,18 +276,21 @@ class l2_expanded_distances_t : public distances_t { * The expanded form is more efficient for sparse data. */ template -class l2_sqrt_expanded_distances_t - : public l2_expanded_distances_t { +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { public: - explicit l2_sqrt_expanded_distances_t( - const distances_config_t &config) - : l2_expanded_distances_t(config) {} + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } - void compute(value_t *out_dists) override { + void compute(value_t* out_dists) override + { l2_expanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -264,25 +304,35 @@ class l2_sqrt_expanded_distances_t template class correlation_expanded_distances_t : public distances_t { public: - explicit correlation_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_corr(out_dists, search_coo_rows.data(), config_->a_data, - config_->a_nnz, b_indices, b_data, config_->b_nnz, - config_->a_nrows, config_->b_nrows, config_->b_ncols, + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, config_->handle.get_device_allocator(), config_->handle.get_stream()); } @@ -290,54 +340,62 @@ class correlation_expanded_distances_t : public distances_t { ~correlation_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; /** - * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2))) - * The expanded form is more efficient for sparse data. + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * + * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data. */ template class cosine_expanded_distances_t : public distances_t { public: - explicit cosine_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t norms = sqrt(q_norm) * sqrt(r_norm); - // deal with potential for 0 in denominator by forcing 0/1 instead - value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); - - // flip the similarity when both rows are 0 - bool both_empty = (q_norm == 0) && (r_norm == 0); - return 1 - ((!both_empty * cos) + both_empty); - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = sqrt(q_norm) * sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); } ~cosine_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -354,25 +412,34 @@ class cosine_expanded_distances_t : public distances_t { template class hellinger_expanded_distances_t : public distances_t { public: - explicit hellinger_expanded_distances_t( - const distances_config_t &config) - : config_(&config), workspace(0, config.handle.get_stream()) {} + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative bool rectifier = (1 - input) > 0; @@ -384,42 +451,43 @@ class hellinger_expanded_distances_t : public distances_t { ~hellinger_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; }; template class russelrao_expanded_distances_t : public distances_t { public: - explicit russelrao_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_t n_cols = config_->a_ncols; + value_t n_cols = config_->a_ncols; value_t n_cols_inv = 1.0 / n_cols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, config_->handle.get_stream()); - auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); - auto diags = thrust::counting_iterator(0); + auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); + auto diags = thrust::counting_iterator(0); value_idx b_nrows = config_->b_nrows; - thrust::for_each(exec_policy, diags, diags + config_->a_nrows, - [=] __device__(value_idx input) { - out_dists[input * b_nrows + input] = 0.0; - }); + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); } ~russelrao_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh index 885d55ee50..7f9511ff03 100644 --- a/cpp/include/raft/sparse/distance/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/lp_distance.cuh @@ -38,23 +38,33 @@ namespace raft { namespace sparse { namespace distance { -template -void unexpanded_lp_distances( - value_t *out_dists, const distances_config_t *config_, - product_f product_func, accum_f accum_func, write_f write_func) { +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - coo_rows.data(), config_->a_nnz, + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv_rev( @@ -71,48 +81,51 @@ void unexpanded_lp_distances( template class l1_unexpanded_distances_t : public distances_t { public: - l1_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Sum(), AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class l2_unexpanded_distances_t : public distances_t { public: - l2_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, SqDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, SqDiff(), Sum(), AtomicAdd()); } protected: - const distances_config_t *config_; + const distances_config_t* config_; }; template -class l2_sqrt_unexpanded_distances_t - : public l2_unexpanded_distances_t { +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { public: - l2_sqrt_unexpanded_distances_t( - const distances_config_t &config) - : l2_unexpanded_distances_t(config) {} + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { l2_unexpanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -124,29 +137,33 @@ class l2_sqrt_unexpanded_distances_t template class linf_unexpanded_distances_t : public distances_t { public: - explicit linf_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Max(), AtomicMax()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Max(), AtomicMax()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class canberra_unexpanded_distances_t : public distances_t { public: - explicit canberra_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { value_t d = fabs(a) + fabs(b); @@ -154,70 +171,82 @@ class canberra_unexpanded_distances_t : public distances_t { // forcing 1/0 instead return ((d != 0) * fabs(a - b)) / (d + (d == 0)); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class lp_unexpanded_distances_t : public distances_t { public: - explicit lp_unexpanded_distances_t( - const distances_config_t &config, value_t p_) - : config_(&config), p(p_) {} + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, PDiff(p), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, PDiff(p), Sum(), AtomicAdd()); float one_over_p = 1.0f / p; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return pow(input, one_over_p); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; value_t p; }; template class hamming_unexpanded_distances_t : public distances_t { public: - explicit hamming_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, NotEqual(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, NotEqual(), Sum(), AtomicAdd()); value_t n_cols = 1.0 / config_->a_ncols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return input * n_cols; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class jensen_shannon_unexpanded_distances_t : public distances_t { public: explicit jensen_shannon_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { - value_t m = 0.5f * (a + b); + value_t m = 0.5f * (a + b); bool a_zero = a == 0; bool b_zero = b == 0; @@ -227,49 +256,61 @@ class jensen_shannon_unexpanded_distances_t : public distances_t { bool x_zero = x == 0; bool y_zero = y == 0; - return (-a * (!x_zero * log(x + x_zero))) + - (-b * (!y_zero * log(y + y_zero))); + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return sqrt(0.5 * input); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class kl_divergence_unexpanded_distances_t : public distances_t { public: explicit kl_divergence_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return 0.5 * input; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; }; // END namespace distance diff --git a/cpp/include/raft/sparse/distance/operators.cuh b/cpp/include/raft/sparse/distance/operators.cuh index 89acda8b1a..3a9d0ba879 100644 --- a/cpp/include/raft/sparse/distance/operators.cuh +++ b/cpp/include/raft/sparse/distance/operators.cuh @@ -24,21 +24,24 @@ namespace distance { struct Sum { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a + b; } }; struct NotEqual { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a != b; } }; struct SqDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return (a - b) * (a - b); } }; @@ -49,44 +52,48 @@ struct PDiff { PDiff(float p_) : p(p_) {} template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return pow(a - b, p); } }; struct Max { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fmax(a, b); } }; struct AtomicAdd { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicAdd(a, b); } }; struct AtomicMax { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicMax(a, b); } }; struct Product { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a * b; } }; struct AbsDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fabs(a - b); } }; diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index 6b6d77a2d5..d78b927e46 100644 --- a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -34,10 +34,10 @@ namespace distance { * @return the maximum number of columns that can be stored in smem */ template -inline int max_cols_per_block() { +inline int max_cols_per_block() +{ // max cols = (total smem available - cub reduction smem) - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(value_t); } diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h index 29f541498b..1738dd7498 100644 --- a/cpp/include/raft/sparse/hierarchy/common.h +++ b/cpp/include/raft/sparse/hierarchy/common.h @@ -37,13 +37,15 @@ class linkage_output { value_idx n_leaves; value_idx n_connected_components; - value_idx *labels; // size: m + value_idx* labels; // size: m - value_idx *children; // size: (m-1, 2) + value_idx* children; // size: (m-1, 2) }; -class linkage_output_int_float : public linkage_output {}; -class linkage_output__int64_float : public linkage_output {}; +class linkage_output_int_float : public linkage_output { +}; +class linkage_output__int64_float : public linkage_output { +}; }; // namespace hierarchy }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 1ac075489a..95df7f4642 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -42,31 +42,32 @@ class UnionFind { value_idx n_indices; UnionFind(value_idx N_) - : n_indices(2 * N_ - 1), - parent(2 * N_ - 1, -1), - size(2 * N_ - 1, 1), - next_label(N_) { + : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_) + { memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx)); } - value_idx find(value_idx n) { + value_idx find(value_idx n) + { value_idx p; p = n; - while (parent[n] != -1) n = parent[n]; + while (parent[n] != -1) + n = parent[n]; // path compression while (parent[p] != n) { - p = parent[p == -1 ? n_indices - 1 : p]; + p = parent[p == -1 ? n_indices - 1 : p]; parent[p == -1 ? n_indices - 1 : p] = n; } return n; } - void perform_union(value_idx m, value_idx n) { + void perform_union(value_idx m, value_idx n) + { size[next_label] = size[m] + size[n]; - parent[m] = next_label; - parent[n] = next_label; + parent[m] = next_label; + parent[n] = next_label; next_label += 1; } @@ -95,12 +96,17 @@ class UnionFind { * @param[out] out_size cluster sizes of output */ template -void build_dendrogram_host(const handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *data, - size_t nnz, value_idx *children, value_t *out_delta, - value_idx *out_size) { +void build_dendrogram_host(const handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* data, + size_t nnz, + value_idx* children, + value_t* out_delta, + value_idx* out_size) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -121,8 +127,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, UnionFind U(nnz + 1); for (value_idx i = 0; i < nnz; i++) { - value_idx a = mst_src_h[i]; - value_idx b = mst_dst_h[i]; + value_idx a = mst_src_h[i]; + value_idx b = mst_dst_h[i]; value_t delta = mst_weights_h[i]; value_idx aa = U.find(a); @@ -130,10 +136,10 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, value_idx children_idx = i * 2; - children_h[children_idx] = aa; + children_h[children_idx] = aa; children_h[children_idx + 1] = bb; - out_delta_h[i] = delta; - out_size_h[i] = U.size[aa] + U.size[bb]; + out_delta_h[i] = delta; + out_size_h[i] = U.size[aa] + U.size[bb]; U.perform_union(aa, bb); } @@ -144,13 +150,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, } template -__global__ void write_levels_kernel(const value_idx *children, - value_idx *parents, value_idx n_vertices) { +__global__ void write_levels_kernel(const value_idx* children, + value_idx* parents, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { value_idx level = tid / 2; value_idx child = children[tid]; - parents[child] = level; + parents[child] = level; } } @@ -166,14 +174,17 @@ __global__ void write_levels_kernel(const value_idx *children, * @param labels */ template -__global__ void inherit_labels(const value_idx *children, - const value_idx *levels, size_t n_leaves, - value_idx *labels, int cut_level, - value_idx n_vertices) { +__global__ void inherit_labels(const value_idx* children, + const value_idx* levels, + size_t n_leaves, + value_idx* labels, + int cut_level, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { - value_idx node = children[tid]; + value_idx node = children[tid]; value_idx cur_level = tid / 2; /** @@ -183,12 +194,12 @@ __global__ void inherit_labels(const value_idx *children, if (cur_level > cut_level) return; value_idx cur_parent = node; - value_idx label = labels[cur_parent]; + value_idx label = labels[cur_parent]; while (label == -1) { cur_parent = cur_level + n_leaves; - cur_level = levels[cur_parent]; - label = labels[cur_parent]; + cur_level = levels[cur_parent]; + label = labels[cur_parent]; } labels[node] = label; @@ -197,15 +208,16 @@ __global__ void inherit_labels(const value_idx *children, template struct init_label_roots { - init_label_roots(value_idx *labels_) : labels(labels_) {} + init_label_roots(value_idx* labels_) : labels(labels_) {} template - __host__ __device__ void operator()(Tuple t) { + __host__ __device__ void operator()(Tuple t) + { labels[thrust::get<1>(t)] = thrust::get<0>(t); } private: - value_idx *labels; + value_idx* labels; }; /** @@ -221,11 +233,14 @@ struct init_label_roots { * @param n_leaves */ template -void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, size_t n_clusters, - size_t n_leaves) { - auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); +void extract_flattened_clusters(const raft::handle_t& handle, + value_idx* labels, + const value_idx* children, + size_t n_clusters, + size_t n_leaves) +{ + auto d_alloc = handle.get_device_allocator(); + auto stream = handle.get_stream(); auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Handle special case where n_clusters == 1 @@ -243,10 +258,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, size_t n_edges = (n_leaves - 1) * 2; - thrust::device_ptr d_ptr = - thrust::device_pointer_cast(children); - value_idx n_vertices = - *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; + thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); + value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; // Prevent potential infinite loop from labeling disconnected // connectivities graph. @@ -257,8 +270,7 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, rmm::device_uvector levels(n_vertices, stream); value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb); - write_levels_kernel<<>>(children, levels.data(), - n_vertices); + write_levels_kernel<<>>(children, levels.data(), n_vertices); /** * Step 1: Find label roots: * @@ -272,27 +284,26 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, rmm::device_uvector label_roots(child_size, stream); value_idx children_cpy_start = n_edges - child_size; - raft::copy_async(label_roots.data(), children + children_cpy_start, - child_size, stream); + raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream); - thrust::sort(thrust_policy, label_roots.data(), + thrust::sort(thrust_policy, + label_roots.data(), label_roots.data() + (child_size), thrust::greater()); rmm::device_uvector tmp_labels(n_vertices, stream); // Init labels to -1 - thrust::fill(thrust_policy, tmp_labels.data(), - tmp_labels.data() + n_vertices, -1); + thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1); // Write labels for cluster roots to "labels" thrust::counting_iterator first(0); - auto z_iter = thrust::make_zip_iterator(thrust::make_tuple( - first, label_roots.data() + (label_roots.size() - n_clusters))); + auto z_iter = thrust::make_zip_iterator( + thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters))); - thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters, - init_label_roots(tmp_labels.data())); + thrust::for_each( + thrust_policy, z_iter, z_iter + n_clusters, init_label_roots(tmp_labels.data())); /** * Step 2: Propagate labels by having children iterate through their parents @@ -302,9 +313,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, */ value_idx cut_level = (n_edges / 2) - (n_clusters - 1); - inherit_labels<<>>(children, levels.data(), - n_leaves, tmp_labels.data(), - cut_level, n_vertices); + inherit_labels<<>>( + children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices); // copy tmp labels to actual labels raft::copy_async(labels, tmp_labels.data(), n_leaves, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 7cf959dda6..096f1c650f 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -37,14 +37,17 @@ namespace raft { namespace hierarchy { namespace detail { -template +template struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c); }; /** @@ -53,50 +56,51 @@ struct distance_graph_impl { * @tparam value_t */ template -struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, +struct distance_graph_impl { + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { - auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) + { + auto d_alloc = handle.get_device_allocator(); + auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Need to symmetrize knn into undirected graph raft::sparse::COO knn_graph_coo(d_alloc, stream); - raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, - c); + raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); indices.resize(knn_graph_coo.nnz, stream); data.resize(knn_graph_coo.nnz, stream); // self-loops get max distance - auto transform_in = thrust::make_zip_iterator(thrust::make_tuple( - knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); - - thrust::transform( - exec_policy, transform_in, transform_in + knn_graph_coo.nnz, - knn_graph_coo.vals(), - [=] __device__(const thrust::tuple &tup) { - bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); - return (self_loop * std::numeric_limits::max()) + - (!self_loop * thrust::get<2>(tup)); - }); - - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - m + 1, d_alloc, stream); + auto transform_in = thrust::make_zip_iterator( + thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); + + thrust::transform(exec_policy, + transform_in, + transform_in + knn_graph_coo.nnz, + knn_graph_coo.vals(), + [=] __device__(const thrust::tuple& tup) { + bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); + return (self_loop * std::numeric_limits::max()) + + (!self_loop * thrust::get<2>(tup)); + }); + + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, d_alloc, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors // ref: https://github.com/rapidsai/raft/issues/227 - raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, - stream); - raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, - stream); + raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, stream); + raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, stream); } }; @@ -116,13 +120,17 @@ struct distance_graph_impl -void get_distance_graph(const raft::handle_t &handle, const value_t *X, - size_t m, size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { +template +void get_distance_graph(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) +{ auto stream = handle.get_stream(); indptr.resize(m + 1, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 765a5ad77f..f939e87484 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -37,9 +37,10 @@ namespace hierarchy { namespace detail { template -void merge_msts(raft::Graph_COO &coo1, - raft::Graph_COO &coo2, - cudaStream_t stream) { +void merge_msts(raft::Graph_COO& coo1, + raft::Graph_COO& coo2, + cudaStream_t stream) +{ /** Add edges to existing mst **/ int final_nnz = coo2.n_edges + coo1.n_edges; @@ -50,12 +51,9 @@ void merge_msts(raft::Graph_COO &coo1, /** * Construct final edge list */ - raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), - coo2.n_edges, stream); + raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream); + raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream); + raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream); coo1.n_edges = final_nnz; } @@ -74,14 +72,18 @@ void merge_msts(raft::Graph_COO &coo1, * @return updated MST edge list */ template -void connect_knn_graph(const raft::handle_t &handle, const value_t *X, - raft::Graph_COO &msf, - size_t m, size_t n, value_idx *color, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_knn_graph( + const raft::handle_t& handle, + const value_t* X, + raft::Graph_COO& msf, + size_t m, + size_t n, + value_idx* color, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); raft::sparse::COO connected_edges(d_alloc, stream); @@ -89,15 +91,21 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), - connected_edges.nnz, indptr2.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, d_alloc, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process - auto new_mst = raft::mst::mst( - handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m, - connected_edges.nnz, color, stream, false, false); + auto new_mst = raft::mst::mst(handle, + indptr2.data(), + connected_edges.cols(), + connected_edges.vals(), + m, + connected_edges.nnz, + color, + stream, + false, + false); merge_msts(msf, new_mst, stream); } @@ -127,29 +135,35 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, * argument is really just a safeguard against the potential for infinite loops. */ template -void build_sorted_mst(const raft::handle_t &handle, const value_t *X, - const value_idx *indptr, const value_idx *indices, - const value_t *pw_dists, size_t m, size_t n, - value_idx *mst_src, value_idx *mst_dst, - value_t *mst_weight, value_idx *color, size_t nnz, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded, - int max_iter = 10) { +void build_sorted_mst( + const raft::handle_t& handle, + const value_t* X, + const value_idx* indptr, + const value_idx* indices, + const value_t* pw_dists, + size_t m, + size_t n, + value_idx* mst_src, + value_idx* mst_dst, + value_t* mst_weight, + value_idx* color, + size_t nnz, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, + int max_iter = 10) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. auto mst_coo = raft::mst::mst( - handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, - true); + handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true); - int iters = 1; + int iters = 1; int n_components = linkage::get_n_components(color, m, d_alloc, stream); while (n_components > 1 && iters < max_iter) { - connect_knn_graph(handle, X, mst_coo, m, n, color, - reduction_op); + connect_knn_graph(handle, X, mst_coo, m, n, color, reduction_op); iters++; @@ -176,9 +190,8 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, " or increase 'max_iter'", max_iter); - raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(), - mst_coo.weights.data(), mst_coo.n_edges, - stream); + raft::sparse::op::coo_sort_by_weight( + mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream); raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream); raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream); diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 01a033945c..fe9538120f 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -44,20 +44,26 @@ static const size_t EMPTY = 0; * @param[in] n number of columns in X * @param[in] metric distance metrix to use when constructing connectivities graph * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control * of k. The algorithm will set `k = log(n) + c` * @param[in] n_clusters number of clusters to assign data samples */ -template -void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, - size_t n, raft::distance::DistanceType metric, - linkage_output *out, int c, - size_t n_clusters) { - ASSERT(n_clusters <= m, - "n_clusters must be less than or equal to the number of data points"); - - auto stream = handle.get_stream(); +void single_linkage(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output* out, + int c, + size_t n_clusters) +{ + ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points"); + + auto stream = handle.get_stream(); auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); @@ -79,10 +85,20 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, */ rmm::device_uvector color(m, stream); raft::linkage::FixConnectivitiesRedOp op(color.data(), m); - detail::build_sorted_mst( - handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n, - mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(), - indices.size(), op, metric); + detail::build_sorted_mst(handle, + X, + indptr.data(), + indices.data(), + pw_dists.data(), + m, + n, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + color.data(), + indices.size(), + op, + metric); pw_dists.release(); @@ -94,15 +110,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, rmm::device_uvector out_delta(n_edges, stream); rmm::device_uvector out_size(n_edges, stream); // Create dendrogram - detail::build_dendrogram_host( - handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges, - out->children, out_delta.data(), out_size.data()); - detail::extract_flattened_clusters(handle, out->labels, out->children, - n_clusters, m); - - out->m = m; - out->n_clusters = n_clusters; - out->n_leaves = m; + detail::build_dendrogram_host(handle, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + n_edges, + out->children, + out_delta.data(), + out_size.data()); + detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m); + + out->m = m; + out->n_clusters = n_clusters; + out->n_leaves = m; out->n_connected_components = 1; } diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 47b1ba6e41..01735a102d 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -40,40 +40,47 @@ namespace sparse { namespace linalg { template -__global__ void csr_add_calc_row_counts_kernel( - const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, - const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, - int *out_rowcounts) { +__global__ void csr_add_calc_row_counts_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_rowcounts) +{ // loop through columns in each set of rows and // calculate number of unique cols across both rows int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { int a_start_idx = a_ind[row]; - int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); + int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); /** - * Union of columns within each row of A and B so that we can scan through - * them, adding their values together. - */ + * Union of columns within each row of A and B so that we can scan through + * them, adding their values together. + */ int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx); - int *arr = new int[max_size]; + int* arr = new int[max_size]; int cur_arr_idx = 0; for (int j = a_start_idx; j < a_stop_idx; j++) { arr[cur_arr_idx] = a_indptr[j]; cur_arr_idx++; } - int arr_size = cur_arr_idx; + int arr_size = cur_arr_idx; int final_size = arr_size; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = 0; k < arr_size; k++) { if (arr[k] == cur_col) { found = true; @@ -81,9 +88,7 @@ __global__ void csr_add_calc_row_counts_kernel( } } - if (!found) { - final_size++; - } + if (!found) { final_size++; } } out_rowcounts[row] = final_size; @@ -94,11 +99,19 @@ __global__ void csr_add_calc_row_counts_kernel( } template -__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, - const T *a_val, int nnz1, const int *b_ind, - const int *b_indptr, const T *b_val, int nnz2, - int m, int *out_ind, int *out_indptr, - T *out_val) { +__global__ void csr_add_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, + int* out_indptr, + T* out_val) +{ // 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -109,21 +122,21 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); int o_idx = out_ind[row]; int cur_o_idx = o_idx; for (int j = a_start_idx; j < a_stop_idx; j++) { out_indptr[cur_o_idx] = a_indptr[j]; - out_val[cur_o_idx] = a_val[j]; + out_val[cur_o_idx] = a_val[j]; cur_o_idx++; } int arr_size = cur_o_idx - o_idx; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = o_idx; k < o_idx + arr_size; k++) { // If we found a match, sum the two values if (out_indptr[k] == cur_col) { @@ -136,7 +149,7 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, // if we didn't find a match, add the value for b if (!found) { out_indptr[o_idx + arr_size] = cur_col; - out_val[o_idx + arr_size] = b_val[j]; + out_val[o_idx + arr_size] = b_val[j]; arr_size++; } } @@ -160,32 +173,36 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param stream: cuda stream to use */ template -size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *out_ind, +size_t csr_add_calc_inds(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); raft::mr::device::buffer row_counts(d_alloc, stream, m + 1); - CUDA_CHECK( - cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); - csr_add_calc_row_counts_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, row_counts.data()); + csr_add_calc_row_counts_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data()); int cnnz = 0; raft::update_host(&cnnz, row_counts.data() + m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; } @@ -208,16 +225,25 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, * @param stream: cuda stream to use */ template -void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *c_ind, - int *c_indptr, T *c_val, cudaStream_t stream) { +void csr_add_finalize(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* c_ind, + int* c_indptr, + T* c_val, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_add_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, c_ind, c_indptr, c_val); + csr_add_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh index 9bd322c90a..77a9445ab1 100644 --- a/cpp/include/raft/sparse/linalg/degree.cuh +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -44,11 +44,10 @@ namespace linalg { * @param results array to place results */ template -__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { +__global__ void coo_degree_kernel(const int* rows, int nnz, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz) { raft::myAtomicAdd(results + rows[row], 1); } } /** @@ -60,7 +59,8 @@ __global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { * @param stream: cuda stream to use */ template -void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) { +void coo_degree(const int* rows, int nnz, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -77,31 +77,28 @@ void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) { * @param stream: cuda stream to use */ template -void coo_degree(COO *in, int *results, cudaStream_t stream) { +void coo_degree(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_kernel - <<>>(in->rows(), in->nnz, results); + coo_degree_kernel<<>>(in->rows(), in->nnz, results); CUDA_CHECK(cudaGetLastError()); } template -__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz, - int *results) { +__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != 0.0) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); } } template -__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, - int nnz, T scalar, int *results) { +__global__ void coo_degree_scalar_kernel( + const int* rows, const T* vals, int nnz, T scalar, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != scalar) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); } } /** @@ -114,12 +111,12 @@ __global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, * @param stream: cuda stream to use */ template -void coo_degree_scalar(COO *in, T scalar, int *results, - cudaStream_t stream) { +void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_scalar_kernel<<>>( - in->rows(), in->vals(), in->nnz, scalar, results); + coo_degree_scalar_kernel + <<>>(in->rows(), in->vals(), in->nnz, scalar, results); CUDA_CHECK(cudaGetLastError()); } @@ -135,8 +132,9 @@ void coo_degree_scalar(COO *in, T scalar, int *results, * @param stream: cuda stream to use */ template -void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, - int *results, cudaStream_t stream = 0) { +void coo_degree_scalar( + const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); coo_degree_scalar_kernel @@ -154,12 +152,11 @@ void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, * @param stream: cuda stream to use */ template -void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, - cudaStream_t stream) { +void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_nz_kernel - <<>>(rows, vals, nnz, results); + coo_degree_nz_kernel<<>>(rows, vals, nnz, results); } /** @@ -171,7 +168,8 @@ void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_degree_nz(COO *in, int *results, cudaStream_t stream) { +void coo_degree_nz(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index bfcd3fd592..59dc5ff3e4 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -41,10 +41,12 @@ __global__ void csr_row_normalize_l1_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ex_scan (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T *result) { // output array + const int* ia, // csr row ex_scan (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -52,7 +54,7 @@ __global__ void csr_row_normalize_l1_kernel( // sum all vals_arr for row and divide each val by sum if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -65,7 +67,7 @@ __global__ void csr_row_normalize_l1_kernel( for (int j = start_idx; j < stop_idx; j++) { if (sum != 0.0) { - T val = vals[j]; + T val = vals[j]; result[j] = val / sum; } else { result[j] = 0.0; @@ -85,18 +87,18 @@ __global__ void csr_row_normalize_l1_kernel( * @param stream: cuda stream to use */ template -void csr_row_normalize_l1(const int *ia, // csr row ex_scan (sorted by row) - const T *vals, +void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num rows in csr - T *result, - cudaStream_t stream) { // output array + T* result, + cudaStream_t stream) +{ // output array dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_l1_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_l1_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } @@ -105,10 +107,12 @@ __global__ void csr_row_normalize_max_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ind array (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T *result) { // output array + const int* ia, // csr row ind array (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num total rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -116,7 +120,7 @@ __global__ void csr_row_normalize_max_kernel( // find max across columns and divide if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -130,7 +134,7 @@ __global__ void csr_row_normalize_max_kernel( // divide nonzeros in current row by max for (int j = start_idx; j < stop_idx; j++) { if (max != 0.0 && max > std::numeric_limits::min()) { - T val = vals[j]; + T val = vals[j]; result[j] = val / max; } else { result[j] = 0.0; @@ -151,16 +155,17 @@ __global__ void csr_row_normalize_max_kernel( */ template -void csr_row_normalize_max(const int *ia, // csr row ind array (sorted by row) - const T *vals, +void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num total rows in csr - T *result, cudaStream_t stream) { + T* result, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_max_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_max_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 15302f3b74..3b609d994f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -31,16 +31,23 @@ namespace sparse { namespace spectral { template -void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, - int nnz, int n, int n_components, T *out, - unsigned long long seed = 1234567) { - auto stream = handle.get_stream(); +void fit_embedding(const raft::handle_t& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ + auto stream = handle.get_stream(); auto d_alloc = handle.get_device_allocator(); raft::mr::device::buffer src_offsets(d_alloc, stream, n + 1); raft::mr::device::buffer dst_cols(d_alloc, stream, nnz); raft::mr::device::buffer dst_vals(d_alloc, stream, nnz); - convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), - dst_cols.data(), dst_vals.data()); + convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); raft::mr::device::buffer eigVals(d_alloc, stream, n_components + 1); raft::mr::device::buffer eigVecs(d_alloc, stream, n * (n_components + 1)); @@ -54,48 +61,53 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using index_type = int; using value_type = T; - index_type *ro = src_offsets.data(); - index_type *ci = dst_cols.data(); - value_type *vs = dst_vals.data(); + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); - raft::matrix::sparse_matrix_t const r_csr_m{ - handle, ro, ci, vs, n, nnz}; + raft::matrix::sparse_matrix_t const r_csr_m{handle, ro, ci, vs, n, nnz}; - index_type neigvs = n_components + 1; - index_type maxiter = 4000; //default reset value (when set to 0); - value_type tol = 0.01; - index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using + auto t_exe_p = thrust::cuda::par.on(stream); using thrust_exe_policy_t = decltype(t_exe_p); - raft::eigen_solver_config_t cfg{neigvs, maxiter, - restart_iter, tol}; + raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; cfg.seed = seed; raft::lanczos_solver_t eig_solver{cfg}; - //cluster computation here is irrelevant, - //hence define a no-op such solver to - //feed partition(): + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): // struct no_op_cluster_solver_t { using index_type_t = index_type; - using size_type_t = index_type; + using size_type_t = index_type; using value_type_t = value_type; - std::pair solve( - handle_t const &handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, - value_type_t const *__restrict__ obs, - index_type_t *__restrict__ codes) const { + std::pair solve(handle_t const& handle, + thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, - no_op_cluster_solver_t{}, labels.data(), - eigVals.data(), eigVecs.data()); + raft::spectral::partition(handle, + t_exe_p, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); raft::copy(out, eigVecs.data() + n, n * n_components, stream); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 5c2c78f0c3..b9426c284a 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -49,26 +49,34 @@ namespace linalg { // TODO: value_idx param needs to be used for this once FAISS is updated to use float32 // for indices so that the index types can be uniform template -__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, - T *vals, int *orows, int *ocols, T *ovals, - int n, int cnnz, Lambda reduction_op) { +__global__ void coo_symmetrize_kernel(int* row_ind, + int* rows, + int* cols, + T* vals, + int* orows, + int* ocols, + T* ovals, + int n, + int cnnz, + Lambda reduction_op) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < n) { int start_idx = row_ind[row]; // each thread processes one row - int stop_idx = get_stop_idx(row, n, cnnz, row_ind); + int stop_idx = get_stop_idx(row, n, cnnz, row_ind); - int row_nnz = 0; + int row_nnz = 0; int out_start_idx = start_idx * 2; for (int idx = 0; idx < stop_idx - start_idx; idx++) { int cur_row = rows[idx + start_idx]; int cur_col = cols[idx + start_idx]; - T cur_val = vals[idx + start_idx]; + T cur_val = vals[idx + start_idx]; int lookup_row = cur_col; - int t_start = row_ind[lookup_row]; // Start at - int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); + int t_start = row_ind[lookup_row]; // Start at + int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); T transpose = 0.0; @@ -79,7 +87,7 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, // done in a different thread. if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) { // If it exists already, set transposed value to existing value - transpose = vals[t_idx]; + transpose = vals[t_idx]; found_match = true; break; } @@ -126,10 +134,12 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param stream: cuda stream to use */ template -void coo_symmetrize(COO *in, COO *out, +void coo_symmetrize(COO* in, + COO* out, Lambda reduction_op, // two-argument reducer std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -141,9 +151,16 @@ void coo_symmetrize(COO *in, COO *out, out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); - coo_symmetrize_kernel<<>>( - in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(), - out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op); + coo_symmetrize_kernel<<>>(in_row_ind.data(), + in->rows(), + in->cols(), + in->vals(), + out->rows(), + out->cols(), + out->vals(), + in->n_rows, + in->nnz, + reduction_op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -159,14 +176,15 @@ void coo_symmetrize(COO *in, COO *out, * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction */ template -__global__ static void symmetric_find_size(const value_t *restrict data, - const value_idx *restrict indices, - const value_idx n, const int k, - value_idx *restrict row_sizes, - value_idx *restrict row_sizes2) { +__global__ static void symmetric_find_size(const value_t* restrict data, + const value_idx* restrict indices, + const value_idx n, + const int k, + value_idx* restrict row_sizes, + value_idx* restrict row_sizes2) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; const auto col = indices[row * k + j]; @@ -186,9 +204,11 @@ __global__ static void symmetric_find_size(const value_t *restrict data, * @param row_sizes2: Input row sum 2 array(n) for faster reduction */ template -__global__ static void reduce_find_size(const value_idx n, const int k, - value_idx *restrict row_sizes, - const value_idx *restrict row_sizes2) { +__global__ static void reduce_find_size(const value_idx n, + const int k, + value_idx* restrict row_sizes, + const value_idx* restrict row_sizes2) +{ const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; if (i >= n) return; row_sizes[i] += (row_sizes2[i] + k); @@ -209,20 +229,21 @@ __global__ static void reduce_find_size(const value_idx n, const int k, * @param k: Number of n_neighbors */ template -__global__ static void symmetric_sum(value_idx *restrict edges, - const value_t *restrict data, - const value_idx *restrict indices, - value_t *restrict VAL, - value_idx *restrict COL, - value_idx *restrict ROW, const value_idx n, - const int k) { +__global__ static void symmetric_sum(value_idx* restrict edges, + const value_t* restrict data, + const value_idx* restrict indices, + value_t* restrict VAL, + value_idx* restrict COL, + value_idx* restrict ROW, + const value_idx n, + const int k) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; - const auto col = indices[row * k + j]; - const auto original = atomicAdd(&edges[row], value_idx(1)); + const auto col = indices[row * k + j]; + const auto original = atomicAdd(&edges[row], value_idx(1)); const auto transpose = atomicAdd(&edges[col], value_idx(1)); VAL[transpose] = VAL[original] = data[row * k + j]; @@ -252,26 +273,26 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param stream: Input cuda stream * @param d_alloc device allocator for temporary buffers */ -template -void from_knn_symmetrize_matrix( - const value_idx *restrict knn_indices, const value_t *restrict knn_dists, - const value_idx n, const int k, COO *out, - cudaStream_t stream, std::shared_ptr d_alloc) { +template +void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices, + const value_t* restrict knn_dists, + const value_idx n, + const int k, + COO* out, + cudaStream_t stream, + std::shared_ptr d_alloc) +{ // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); - const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), - raft::ceildiv(k, TPB_Y)); + const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) raft::mr::device::buffer row_sizes(d_alloc, stream, n); - CUDA_CHECK( - cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); raft::mr::device::buffer row_sizes2(d_alloc, stream, n); - CUDA_CHECK( - cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); symmetric_find_size<<>>( knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data()); @@ -292,14 +313,12 @@ void from_knn_symmetrize_matrix( // This mirrors CSR matrix's row Pointer, were maximum bounds for each row // are calculated as the cumulative rolling sum of the previous rows. // Notice reusing old row_sizes2 memory - value_idx *edges = row_sizes2.data(); - thrust::device_ptr __edges = thrust::device_pointer_cast(edges); - thrust::device_ptr __row_sizes = - thrust::device_pointer_cast(row_sizes.data()); + value_idx* edges = row_sizes2.data(); + thrust::device_ptr __edges = thrust::device_pointer_cast(edges); + thrust::device_ptr __row_sizes = thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, - __row_sizes + n, __edges); + thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, __row_sizes + n, __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -311,11 +330,17 @@ void from_knn_symmetrize_matrix( * Symmetrizes a COO matrix */ template -void symmetrize(const raft::handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *vals, size_t m, size_t n, - size_t nnz, raft::sparse::COO &out) { +void symmetrize(const raft::handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t m, + size_t n, + size_t nnz, + raft::sparse::COO& out) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // copy rows to cols and cols to rows rmm::device_uvector symm_rows(nnz * 2, stream); @@ -331,13 +356,17 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream); // sort COO - raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, - symm_rows.data(), symm_cols.data(), - symm_vals.data(), d_alloc, stream); - - raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), - symm_cols.data(), symm_vals.data(), nnz * 2, - m, n); + raft::sparse::op::coo_sort((value_idx)m, + (value_idx)n, + (value_idx)nnz * 2, + symm_rows.data(), + symm_cols.data(), + symm_vals.data(), + d_alloc, + stream); + + raft::sparse::op::max_duplicates( + handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 6afe4ca8f6..ce90eb6702 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -57,29 +57,55 @@ namespace linalg { * @param[in] stream : Cuda stream for ordering events */ template -void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, - const value_idx *csr_indices, const value_t *csr_data, - value_idx *csc_indptr, value_idx *csc_indices, - value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, +void csr_transpose(cusparseHandle_t handle, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx* csc_indptr, + value_idx* csc_indices, + value_t* csc_data, + value_idx csr_nrows, + value_idx csr_ncols, value_idx nnz, std::shared_ptr allocator, - cudaStream_t stream) { + cudaStream_t stream) +{ size_t convert_csc_workspace_size = 0; - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - &convert_csc_workspace_size, stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &convert_csc_workspace_size, + stream)); raft::mr::device::buffer convert_csc_workspace( allocator, stream, convert_csc_workspace_size); - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - convert_csc_workspace.data(), stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + convert_csc_workspace.data(), + stream)); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh index f0d30b0cb7..36d426029b 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh @@ -28,10 +28,16 @@ namespace mst { namespace detail { template -__global__ void kernel_min_edge_per_vertex( - const edge_t* offsets, const vertex_t* indices, const alteration_t* weights, - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) { +__global__ void kernel_min_edge_per_vertex(const edge_t* offsets, + const vertex_t* indices, + const alteration_t* weights, + const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + const bool* mst_edge, + alteration_t* min_edge_color, + const vertex_t v) +{ edge_t tid = threadIdx.x + blockIdx.x * blockDim.x; unsigned warp_id = tid / 32; @@ -41,14 +47,14 @@ __global__ void kernel_min_edge_per_vertex( __shared__ alteration_t min_edge_weight[32]; __shared__ vertex_t min_color[32]; - min_edge_index[lane_id] = std::numeric_limits::max(); + min_edge_index[lane_id] = std::numeric_limits::max(); min_edge_weight[lane_id] = std::numeric_limits::max(); - min_color[lane_id] = std::numeric_limits::max(); + min_color[lane_id] = std::numeric_limits::max(); __syncthreads(); vertex_t self_color_idx = color_index[warp_id]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // find the minimum edge associated per row // each thread in warp holds the minimum edge for @@ -56,20 +62,20 @@ __global__ void kernel_min_edge_per_vertex( if (warp_id < v) { // one row is associated with one warp edge_t row_start = offsets[warp_id]; - edge_t row_end = offsets[warp_id + 1]; + edge_t row_end = offsets[warp_id + 1]; // assuming one warp per row // find min for each thread in warp for (edge_t e = row_start + lane_id; e < row_end; e += 32) { alteration_t curr_edge_weight = weights[e]; - vertex_t successor_color_idx = color_index[indices[e]]; - vertex_t successor_color = color[successor_color_idx]; + vertex_t successor_color_idx = color_index[indices[e]]; + vertex_t successor_color = color[successor_color_idx]; if (!mst_edge[e] && self_color != successor_color) { if (curr_edge_weight < min_edge_weight[lane_id]) { - min_color[lane_id] = successor_color; + min_color[lane_id] = successor_color; min_edge_weight[lane_id] = curr_edge_weight; - min_edge_index[lane_id] = e; + min_edge_index[lane_id] = e; } } } @@ -82,9 +88,9 @@ __global__ void kernel_min_edge_per_vertex( for (int offset = 16; offset > 0; offset >>= 1) { if (lane_id < offset) { if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) { - min_color[lane_id] = min_color[lane_id + offset]; + min_color[lane_id] = min_color[lane_id + offset]; min_edge_weight[lane_id] = min_edge_weight[lane_id + offset]; - min_edge_index[lane_id] = min_edge_index[lane_id + offset]; + min_edge_index[lane_id] = min_edge_index[lane_id + offset]; } } __syncthreads(); @@ -102,19 +108,26 @@ __global__ void kernel_min_edge_per_vertex( } } -template -__global__ void min_edge_per_supervertex( - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - bool* mst_edge, const vertex_t* indices, const weight_t* weights, - const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst, - weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v, - bool symmetrize_output) { +template +__global__ void min_edge_per_supervertex(const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + bool* mst_edge, + const vertex_t* indices, + const weight_t* weights, + const alteration_t* altered_weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const alteration_t* min_edge_color, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { vertex_t vertex_color_idx = color_index[tid]; - vertex_t vertex_color = color[vertex_color_idx]; - edge_t edge_idx = new_mst_edge[tid]; + vertex_t vertex_color = color[vertex_color_idx]; + edge_t edge_idx = new_mst_edge[tid]; // check if valid outgoing edge was found // find minimum edge is same as minimum edge of whole supervertex @@ -129,32 +142,27 @@ __global__ void min_edge_per_supervertex( auto dst = indices[edge_idx]; if (!symmetrize_output) { auto dst_edge_idx = new_mst_edge[dst]; - auto dst_color = color[color_index[dst]]; + auto dst_color = color[color_index[dst]]; // vertices added each other // only if destination has found an edge // the edge points back to source // the edge is minimum edge found for dst color - if (dst_edge_idx != std::numeric_limits::max() && - indices[dst_edge_idx] == tid && + if (dst_edge_idx != std::numeric_limits::max() && indices[dst_edge_idx] == tid && min_edge_color[dst_color] == altered_weights[dst_edge_idx]) { - if (vertex_color > dst_color) { - add_edge = false; - } + if (vertex_color > dst_color) { add_edge = false; } } } if (add_edge) { - temp_src[tid] = tid; - temp_dst[tid] = dst; - temp_weights[tid] = weights[edge_idx]; + temp_src[tid] = tid; + temp_dst[tid] = dst; + temp_weights[tid] = weights[edge_idx]; mst_edge[edge_idx] = true; } } - if (!add_edge) { - new_mst_edge[tid] = std::numeric_limits::max(); - } + if (!add_edge) { new_mst_edge[tid] = std::numeric_limits::max(); } } } } @@ -162,9 +170,13 @@ __global__ void min_edge_per_supervertex( template __global__ void add_reverse_edge(const edge_t* new_mst_edge, const vertex_t* indices, - const weight_t* weights, vertex_t* temp_src, - vertex_t* temp_dst, weight_t* temp_weights, - const vertex_t v, bool symmetrize_output) { + const weight_t* weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { @@ -186,9 +198,7 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // if vertices did not pick each other // add a reverse edge - if (tid != neighbor_vertex_neighbor) { - reverse_needed = true; - } + if (tid != neighbor_vertex_neighbor) { reverse_needed = true; } } } @@ -197,8 +207,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // it is assumed the each vertex only picks one valid min edge // per cycle // hence, we store at index tid + v for the reverse edge scenario - temp_src[tid + v] = neighbor_vertex; - temp_dst[tid + v] = tid; + temp_src[tid + v] = neighbor_vertex; + temp_dst[tid + v] = tid; temp_weights[tid + v] = weights[edge_idx]; } } @@ -207,11 +217,13 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // executes for newly added mst edges and updates the colors of both vertices to the lower color template -__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, +__global__ void min_pair_colors(const vertex_t v, + const vertex_t* indices, const edge_t* new_mst_edge, const vertex_t* color, const vertex_t* color_index, - vertex_t* next_color) { + vertex_t* next_color) +{ auto i = get_1D_idx(); if (i < v) { @@ -220,9 +232,9 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, if (edge_idx != std::numeric_limits::max()) { vertex_t neighbor_vertex = indices[edge_idx]; // vertex_t self_color = color[i]; - vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; - vertex_t neighbor_color_idx = color_index[neighbor_vertex]; + vertex_t self_color_idx = color_index[i]; + vertex_t self_color = color[self_color_idx]; + vertex_t neighbor_color_idx = color_index[neighbor_vertex]; vertex_t neighbor_super_color = color[neighbor_color_idx]; // update my own color as source of edge @@ -238,33 +250,36 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, // for each vertex, update color if it was changed in min_pair_colors kernel template -__global__ void update_colors(const vertex_t v, vertex_t* color, +__global__ void update_colors(const vertex_t v, + vertex_t* color, const vertex_t* color_index, - const vertex_t* next_color, bool* done) { + const vertex_t* next_color, + bool* done) +{ auto i = get_1D_idx(); if (i < v) { - vertex_t self_color = color[i]; + vertex_t self_color = color[i]; vertex_t self_color_idx = color_index[i]; - vertex_t new_color = next_color[self_color_idx]; + vertex_t new_color = next_color[self_color_idx]; // update self color to new smaller color if (self_color > new_color) { color[i] = new_color; - *done = false; + *done = false; } } } // point vertices to their final color index template -__global__ void final_color_indices(const vertex_t v, const vertex_t* color, - vertex_t* color_index) { +__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index) +{ auto i = get_1D_idx(); if (i < v) { vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // if self color is not equal to self color index, // it means self is not supervertex @@ -272,7 +287,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // parent supervertex while (self_color_idx != self_color) { self_color_idx = color_index[self_color]; - self_color = color[self_color_idx]; + self_color = color[self_color_idx]; } // point to new supervertex @@ -282,22 +297,23 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu // Consider using curand device API instead of precomputed random_values array -template -__global__ void alteration_kernel(const vertex_t v, const edge_t e, +template +__global__ void alteration_kernel(const vertex_t v, + const edge_t e, const edge_t* offsets, const vertex_t* indices, - const weight_t* weights, alteration_t max, + const weight_t* weights, + alteration_t max, alteration_t* random_values, - alteration_t* altered_weights) { + alteration_t* altered_weights) +{ auto row = get_1D_idx(); if (row < v) { auto row_begin = offsets[row]; - auto row_end = offsets[row + 1]; + auto row_end = offsets[row + 1]; for (auto i = row_begin; i < row_end; i++) { - auto column = indices[i]; - altered_weights[i] = - weights[i] + max * (random_values[row] + random_values[column]); + auto column = indices[i]; + altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]); } } } @@ -305,17 +321,15 @@ __global__ void alteration_kernel(const vertex_t v, const edge_t e, template __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src, edge_t* mst_edge_count, - const vertex_t v) { + const vertex_t v) +{ auto tid = get_1D_idx(); // count number of new mst edges added - bool predicate = - tid < v && (mst_src[tid] != std::numeric_limits::max()); + bool predicate = tid < v && (mst_src[tid] != std::numeric_limits::max()); vertex_t block_count = __syncthreads_count(predicate); - if (threadIdx.x == 0 && block_count > 0) { - atomicAdd(mst_edge_count, block_count); - } + if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); } } } // namespace detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index c5ba4fcb4f..158f4cc314 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -46,21 +46,30 @@ typedef std::chrono::high_resolution_clock Clock; // curand generator uniform inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - float* outputPtr, size_t n) { + float* outputPtr, + size_t n) +{ return curandGenerateUniform(generator, outputPtr, n); } inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - double* outputPtr, size_t n) { + double* outputPtr, + size_t n) +{ return curandGenerateUniformDouble(generator, outputPtr, n); } -template -MST_solver::MST_solver( - const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, const vertex_t v_, - const edge_t e_, vertex_t* color_, cudaStream_t stream_, - bool symmetrize_output_, bool initialize_colors_, int iterations_) +template +MST_solver::MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_) : handle(handle_), offsets(offsets_), indices(indices_), @@ -82,12 +91,13 @@ MST_solver::MST_solver( stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), - iterations(iterations_) { - max_blocks = handle_.get_device_properties().maxGridSize[0]; + iterations(iterations_) +{ + max_blocks = handle_.get_device_properties().maxGridSize[0]; max_threads = handle_.get_device_properties().maxThreadsPerBlock; - sm_count = handle_.get_device_properties().multiProcessorCount; + sm_count = handle_.get_device_properties().multiProcessorCount; - //Initially, color holds the vertex id as color + // Initially, color holds the vertex id as color auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); @@ -98,10 +108,10 @@ MST_solver::MST_solver( thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } -template +template raft::Graph_COO -MST_solver::solve() { +MST_solver::solve() +{ RAFT_EXPECTS(v > 0, "0 vertices"); RAFT_EXPECTS(e > 0, "0 edges"); RAFT_EXPECTS(offsets != nullptr, "Null offsets."); @@ -114,12 +124,13 @@ MST_solver::solve() { // Alterating the weights // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta. - // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight. + // For each edge, add noise that is less than theta. That is, generate a random number in the + // range [0.0, theta) and add it to each edge weight. alteration(); #ifdef MST_TIME auto stop = Clock::now(); - timer0 = duration_us(stop - start); + timer0 = duration_us(stop - start); #endif auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1; @@ -168,8 +179,8 @@ MST_solver::solve() { if (curr_mst_edge_count == prev_mst_edge_count[0]) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; - std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 - << "," << timer4 << "," << timer5 << std::endl; + std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << "," + << timer5 << std::endl; #endif // exit here when reaching steady state break; @@ -179,8 +190,7 @@ MST_solver::solve() { start = Clock::now(); #endif // append the newly found MST edges to the final output - append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), - mst_result.weights.data()); + append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data()); #ifdef MST_TIME stop = Clock::now(); timer4 += duration_us(stop - start); @@ -201,7 +211,7 @@ MST_solver::solve() { // result packaging thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = host_mst_edge_count[0]; mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -212,50 +222,46 @@ MST_solver::solve() { // ||y|-|x|| template struct alteration_functor { - __host__ __device__ weight_t - operator()(const thrust::tuple& t) { + __host__ __device__ weight_t operator()(const thrust::tuple& t) + { auto x = thrust::get<0>(t); auto y = thrust::get<1>(t); - x = x < 0 ? -x : x; - y = y < 0 ? -y : y; + x = x < 0 ? -x : x; + y = y < 0 ? -y : y; return x < y ? y - x : x - y; } }; // Compute the uper bound for the alteration -template -alteration_t -MST_solver::alteration_max() { +template +alteration_t MST_solver::alteration_max() +{ auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); rmm::device_vector tmp(e); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); - //sort tmp weights + // sort tmp weights thrust::sort(policy, tmp.begin(), tmp.end()); - //remove duplicates + // remove duplicates auto new_end = thrust::unique(policy, tmp.begin(), tmp.end()); - //min(a[i+1]-a[i])/2 - auto begin = - thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); - auto end = - thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; - auto max = - thrust::transform_reduce(policy, begin, end, alteration_functor(), - init, thrust::minimum()); + // min(a[i+1]-a[i])/2 + auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); + auto init = tmp[1] - tmp[0]; + auto max = thrust::transform_reduce( + policy, begin, end, alteration_functor(), init, thrust::minimum()); return max / static_cast(2); } // Compute the alteration to make all undirected edge weight unique // Preserves weights order -template -void MST_solver::alteration() { +template +void MST_solver::alteration() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); // maximum alteration that does not change realtive weights order alteration_t max = alteration_max(); @@ -269,35 +275,32 @@ void MST_solver::alteration() { curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = - curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = curand_generate_uniformX(randGen, rand_values.data().get(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); - RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, - "MST: CURAND cleanup failed"); + RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed"); - //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu + // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), - altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data().get(), altered_weights.data().get()); } // updates colors of vertices by propagating the lower color to the higher -template -void MST_solver::label_prop( - vertex_t* mst_src, vertex_t* mst_dst) { +template +void MST_solver::label_prop(vertex_t* mst_src, + vertex_t* mst_dst) +{ // update the colors of both ends its until there is no change in colors thrust::host_vector curr_mst_edge_count = mst_edge_count; auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); - auto min_pair_nblocks = std::min( - (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); + auto min_pair_nblocks = + std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); rmm::device_vector done(1, false); edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); + vertex_t* color_ptr = color.data().get(); vertex_t* next_color_ptr = next_color.data().get(); bool* done_ptr = done.data().get(); @@ -314,84 +317,99 @@ void MST_solver::label_prop( i++; } - detail:: - final_color_indices<<>>( - v, color_ptr, color_index); + detail::final_color_indices<<>>( + v, color_ptr, color_index); #ifdef MST_TIME std::cout << "Label prop iterations: " << i << std::endl; #endif } // Finds the minimum edge from each vertex to the lowest color -template -void MST_solver::min_edge_per_vertex() { +template +void MST_solver::min_edge_per_vertex() +{ auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), - std::numeric_limits::max()); - thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), - std::numeric_limits::max()); + thrust::fill( + policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); + thrust::fill( + policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits::max()); int n_threads = 32; - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); + vertex_t* color_ptr = color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); + bool* mst_edge_ptr = mst_edge.data().get(); + alteration_t* min_edge_color_ptr = min_edge_color.data().get(); alteration_t* altered_weights_ptr = altered_weights.data().get(); - detail::kernel_min_edge_per_vertex<<>>( - offsets, indices, altered_weights_ptr, color_ptr, color_index, - new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v); + detail::kernel_min_edge_per_vertex<<>>(offsets, + indices, + altered_weights_ptr, + color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + min_edge_color_ptr, + v); } // Finds the minimum edge from each supervertex to the lowest color -template -void MST_solver::min_edge_per_supervertex() { +template +void MST_solver::min_edge_per_supervertex() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - thrust::fill(policy, temp_src.begin(), temp_src.end(), - std::numeric_limits::max()); + thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); + vertex_t* color_ptr = color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); + bool* mst_edge_ptr = mst_edge.data().get(); + alteration_t* min_edge_color_ptr = min_edge_color.data().get(); alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); - - detail::min_edge_per_supervertex<<>>( - color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, - altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr, - min_edge_color_ptr, v, symmetrize_output); + vertex_t* temp_src_ptr = temp_src.data().get(); + vertex_t* temp_dst_ptr = temp_dst.data().get(); + weight_t* temp_weights_ptr = temp_weights.data().get(); + + detail::min_edge_per_supervertex<<>>(color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + indices, + weights, + altered_weights_ptr, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + min_edge_color_ptr, + v, + symmetrize_output); // the above kernel only adds directed mst edges in the case where // a pair of vertices don't pick the same min edge between them // so, now we add the reverse edge to make it undirected if (symmetrize_output) { - detail::add_reverse_edge<<>>( - new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr, - temp_weights_ptr, v, symmetrize_output); + detail::add_reverse_edge<<>>(new_mst_edge_ptr, + indices, + weights, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + v, + symmetrize_output); } } -template -void MST_solver::check_termination() { +template +void MST_solver::check_termination() +{ vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads); - vertex_t nblocks = - std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); + vertex_t nblocks = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + vertex_t* temp_src_ptr = temp_src.data().get(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -399,36 +417,40 @@ void MST_solver::check_termination() { template struct new_edges_functor { - __host__ __device__ bool operator()( - const thrust::tuple& t) { + __host__ __device__ bool operator()(const thrust::tuple& t) + { auto src = thrust::get<0>(t); return src != std::numeric_limits::max() ? true : false; } }; -template +template void MST_solver::append_src_dst_pair( - vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { + vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) +{ auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); auto curr_mst_edge_count = prev_mst_edge_count[0]; // iterator to end of mst edges added to final output in previous iteration - auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( - mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count, - mst_weights + curr_mst_edge_count)); + auto src_dst_zip_end = + thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count, + mst_dst + curr_mst_edge_count, + mst_weights + curr_mst_edge_count)); // iterator to new mst edges found - auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple( - temp_src.begin(), temp_dst.begin(), temp_weights.begin())); + auto temp_src_dst_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin())); auto temp_src_dst_zip_end = thrust::make_zip_iterator( thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end())); // copy new mst edges to final output - thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end, - src_dst_zip_end, new_edges_functor()); + thrust::copy_if(policy, + temp_src_dst_zip_begin, + temp_src_dst_zip_end, + src_dst_zip_end, + new_edges_functor()); } } // namespace mst diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 8f755de459..24127c993f 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -26,32 +26,29 @@ namespace mst { namespace detail { template -__device__ idx_t get_1D_idx() { +__device__ idx_t get_1D_idx() +{ return blockIdx.x * blockDim.x + threadIdx.x; } // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", - const size_t displ = 5) { +void printv(rmm::device_vector& vec, const std::string& name = "", const size_t displ = 5) +{ #ifdef MST_TIME std::cout.precision(15); std::cout << name << " size = " << vec.size() << std::endl; if (displ < vec.size()) { - thrust::copy(vec.begin(), vec.begin() + displ, - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator(std::cout, " ")); std::cout << " ... "; - thrust::copy(vec.end() - displ, vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator(std::cout, " ")); } else { - thrust::copy(vec.begin(), vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.end(), std::ostream_iterator(std::cout, " ")); } std::cout << std::endl << std::endl; #endif } -#define duration_us(a) \ - std::chrono::duration_cast(a).count() +#define duration_us(a) std::chrono::duration_cast(a).count() } // namespace detail } // namespace mst diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh index 10c981445e..b49003467b 100644 --- a/cpp/include/raft/sparse/mst/mst.cuh +++ b/cpp/include/raft/sparse/mst/mst.cuh @@ -22,16 +22,30 @@ namespace raft { namespace mst { -template -raft::Graph_COO mst( - const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices, - weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color, - cudaStream_t stream, bool symmetrize_output = true, - bool initialize_colors = true, int iterations = 0) { - MST_solver mst_solver( - handle, offsets, indices, weights, v, e, color, stream, symmetrize_output, - initialize_colors, iterations); +template +raft::Graph_COO mst(const raft::handle_t& handle, + edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights, + vertex_t const v, + edge_t const e, + vertex_t* color, + cudaStream_t stream, + bool symmetrize_output = true, + bool initialize_colors = true, + int iterations = 0) +{ + MST_solver mst_solver(handle, + offsets, + indices, + weights, + v, + e, + color, + stream, + symmetrize_output, + initialize_colors, + iterations); return mst_solver.solve(); } diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index 833882ea0d..e32bcfacac 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -31,20 +31,27 @@ struct Graph_COO { edge_t n_edges; Graph_COO(vertex_t size, cudaStream_t stream) - : src(size, stream), dst(size, stream), weights(size, stream) {} + : src(size, stream), dst(size, stream), weights(size, stream) + { + } }; namespace mst { -template +template class MST_solver { public: - MST_solver(const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, - const vertex_t v_, const edge_t e_, vertex_t* color_, - cudaStream_t stream_, bool symmetrize_output_, - bool initialize_colors_, int iterations_); + MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_); raft::Graph_COO solve(); @@ -56,7 +63,7 @@ class MST_solver { bool symmetrize_output, initialize_colors; int iterations; - //CSR + // CSR const edge_t* offsets; const vertex_t* indices; const weight_t* weights; @@ -67,20 +74,16 @@ class MST_solver { vertex_t max_threads; vertex_t sm_count; - vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector - min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector - altered_weights; // weights to be used for mst + vertex_t* color_index; // represent each supervertex as a color + rmm::device_vector min_edge_color; // minimum incident edge weight per color + rmm::device_vector new_mst_edge; // new minimum edge per vertex + rmm::device_vector altered_weights; // weights to be used for mst + rmm::device_vector mst_edge_count; // total number of edges added after every iteration rmm::device_vector - mst_edge_count; // total number of edges added after every iteration - rmm::device_vector - prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector - mst_edge; // mst output - true if the edge belongs in mst + prev_mst_edge_count; // total number of edges up to the previous iteration + rmm::device_vector mst_edge; // mst output - true if the edge belongs in mst rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_vector color; // index of color that vertex points to // new src-dst pairs found per iteration rmm::device_vector temp_src; @@ -93,8 +96,7 @@ class MST_solver { void check_termination(); void alteration(); alteration_t alteration_max(); - void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, - weight_t* mst_weights); + void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights); }; } // namespace mst diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 562d506cfe..397fecaaea 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -42,15 +42,23 @@ namespace sparse { namespace op { template -__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, - const T *vals, int nnz, int *crows, - int *ccols, T *cvals, int *ex_scan, - int *cur_ex_scan, int m, T scalar) { +__global__ void coo_remove_scalar_kernel(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* ex_scan, + int* cur_ex_scan, + int m, + T scalar) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { - int start = cur_ex_scan[row]; - int stop = get_stop_idx(row, m, nnz, cur_ex_scan); + int start = cur_ex_scan[row]; + int stop = get_stop_idx(row, m, nnz, cur_ex_scan); int cur_out_idx = ex_scan[row]; for (int idx = start; idx < stop; idx++) { @@ -82,37 +90,51 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, * @param stream: cuda stream to use */ template -void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, +void coo_remove_scalar(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* cnnz, + int* cur_cnnz, + T scalar, + int n, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer ex_scan(d_alloc, stream, n); raft::mr::device::buffer cur_ex_scan(d_alloc, stream, n); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); - thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); - thrust::device_ptr dev_ex_scan = - thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, - dev_ex_scan); + thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); + thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); - thrust::device_ptr dev_cur_ex_scan = - thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, - dev_cur_cnnz + n, dev_cur_ex_scan); + thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); + thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); + thrust::exclusive_scan( + thrust::cuda::par.on(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - coo_remove_scalar_kernel<<>>( - rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(), - dev_cur_ex_scan.get(), n, scalar); + coo_remove_scalar_kernel<<>>(rows, + cols, + vals, + nnz, + crows, + ccols, + cvals, + dev_ex_scan.get(), + dev_cur_ex_scan.get(), + n, + scalar); CUDA_CHECK(cudaPeekAtLastError()); } @@ -126,35 +148,44 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, +void coo_remove_scalar(COO* in, + COO* out, + T scalar, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer row_count_nz(d_alloc, stream, in->n_rows); raft::mr::device::buffer row_count(d_alloc, stream, in->n_rows); - CUDA_CHECK( - cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, - row_count_nz.data(), stream); + linalg::coo_degree_scalar( + in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr d_row_count_nz = - thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, - d_row_count_nz + in->n_rows); + thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); + int out_nnz = + thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); - coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, - out->rows(), out->cols(), out->vals(), - row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); + coo_remove_scalar(in->rows(), + in->cols(), + in->vals(), + in->nnz, + out->rows(), + out->cols(), + out->vals(), + row_count_nz.data(), + row_count.data(), + scalar, + in->n_rows, + d_alloc, + stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -167,9 +198,11 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, +void coo_remove_zeros(COO* in, + COO* out, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ coo_remove_scalar(in, out, T(0.0), d_alloc, stream); } diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 53c9f89074..bc4d7bace5 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -46,25 +46,29 @@ namespace sparse { namespace op { template -__global__ void compute_duplicates_diffs_kernel(const value_idx *rows, - const value_idx *cols, - value_idx *diff, size_t nnz) { +__global__ void compute_duplicates_diffs_kernel(const value_idx* rows, + const value_idx* cols, + value_idx* diff, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; value_idx d = 1; - if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) - d = 0; + if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0; diff[tid] = d; } template -__global__ void max_duplicates_kernel(const value_idx *src_rows, - const value_idx *src_cols, - const value_t *src_vals, - const value_idx *index, - value_idx *out_rows, value_idx *out_cols, - value_t *out_vals, size_t nnz) { +__global__ void max_duplicates_kernel(const value_idx* src_rows, + const value_idx* src_cols, + const value_t* src_vals, + const value_idx* index, + value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < nnz) { @@ -96,13 +100,13 @@ __global__ void max_duplicates_kernel(const value_idx *src_rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void compute_duplicates_mask(value_idx *mask, const value_idx *rows, - const value_idx *cols, size_t nnz, - cudaStream_t stream) { +void compute_duplicates_mask( + value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream) +{ CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream)); - compute_duplicates_diffs_kernel<<>>(rows, cols, mask, nnz); + compute_duplicates_diffs_kernel<<>>( + rows, cols, mask, nnz); } /** @@ -122,12 +126,17 @@ void compute_duplicates_mask(value_idx *mask, const value_idx *rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void max_duplicates(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_idx *rows, const value_idx *cols, - const value_t *vals, size_t nnz, size_t m, size_t n) { +void max_duplicates(const raft::handle_t& handle, + raft::sparse::COO& out, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t nnz, + size_t m, + size_t n) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); @@ -136,8 +145,8 @@ void max_duplicates(const raft::handle_t &handle, compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), - diff.data() + diff.size(), diff.data()); + thrust::exclusive_scan( + thrust::cuda::par.on(stream), diff.data(), diff.data() + diff.size(), diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh index 9e5034dc28..194a878ac1 100644 --- a/cpp/include/raft/sparse/op/row_op.cuh +++ b/cpp/include/raft/sparse/op/row_op.cuh @@ -38,12 +38,12 @@ namespace sparse { namespace op { template void> -__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, - Lambda op) { +__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op) +{ T row = blockIdx.x * TPB_X + threadIdx.x; if (row < n_rows) { T start_idx = row_ind[row]; - T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; + T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; op(row, start_idx, stop_idx); } } @@ -59,14 +59,12 @@ __global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, * @param op custom row operation functor accepting the row and beginning index. * @param stream cuda stream to use */ -template void> -void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op, - cudaStream_t stream) { +template void> +void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream) +{ dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_op_kernel - <<>>(row_ind, n_rows, nnz, op); + csr_row_op_kernel<<>>(row_ind, n_rows, nnz, op); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h index 46f4f41879..9bbe04cf34 100644 --- a/cpp/include/raft/sparse/op/slice.h +++ b/cpp/include/raft/sparse/op/slice.h @@ -50,10 +50,14 @@ namespace op { * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, - const value_idx *indptr, value_idx *indptr_out, - value_idx *start_offset, value_idx *stop_offset, - cudaStream_t stream) { +void csr_row_slice_indptr(value_idx start_row, + value_idx stop_row, + const value_idx* indptr, + value_idx* indptr_out, + value_idx* start_offset, + value_idx* stop_offset, + cudaStream_t stream) +{ raft::update_host(start_offset, indptr + start_row, 1, stream); raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream); @@ -63,11 +67,12 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1, // we add another 1 to stop row. - raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, - stream); + raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream); raft::linalg::unaryOp( - indptr_out, indptr_out, (stop_row + 2) - start_row, + indptr_out, + indptr_out, + (stop_row + 2) - start_row, [s_offset] __device__(value_idx input) { return input - s_offset; }, stream); } @@ -85,12 +90,15 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset, - const value_idx *indices, const value_t *data, - value_idx *indices_out, value_t *data_out, - cudaStream_t stream) { - raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, - stream); +void csr_row_slice_populate(value_idx start_offset, + value_idx stop_offset, + const value_idx* indices, + const value_t* data, + value_idx* indices_out, + value_t* data_out, + cudaStream_t stream) +{ + raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream); raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream); } diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 9dbe2b67c5..3cab24fc09 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -42,7 +42,8 @@ namespace op { struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -66,15 +67,21 @@ struct TupleComp { * @param stream: cuda stream to use */ template -void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, +void coo_sort(int m, + int n, + int nnz, + int* rows, + int* cols, + T* vals, // TODO: Remove this std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, - coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key( + thrust::cuda::par.on(stream), coo_indices, coo_indices + nnz, vals, TupleComp()); } /** @@ -85,12 +92,12 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, +void coo_sort(COO* const in, // TODO: Remove this std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); + cudaStream_t stream) +{ + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), d_alloc, stream); } /** @@ -104,16 +111,16 @@ void coo_sort(COO *const in, * @param[in] stream cuda stream for which to order cuda operations */ template -void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, - value_idx nnz, cudaStream_t stream) { +void coo_sort_by_weight( + value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) +{ thrust::device_ptr t_rows = thrust::device_pointer_cast(rows); thrust::device_ptr t_cols = thrust::device_pointer_cast(cols); - thrust::device_ptr t_data = thrust::device_pointer_cast(data); + thrust::device_ptr t_data = thrust::device_pointer_cast(data); auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, - first); + thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 8aae90f1d8..ec8bec6eb3 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -59,17 +59,20 @@ struct KeyValuePair { __host__ __device__ __forceinline__ KeyValuePair() {} /// Copy Constructor - __host__ __device__ __forceinline__ - KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) + : key(kvp.key), value(kvp.value) + { + } /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const &key, - Value const &value) - : key(key), value(value) {} + __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) + : key(key), value(value) + { + } /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) { + __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) + { return (value != b.value) || (key != b.key); } }; @@ -83,31 +86,32 @@ struct KeyValuePair { */ template struct FixConnectivitiesRedOp { - value_idx *colors; + value_idx* colors; value_idx m; - FixConnectivitiesRedOp(value_idx *colors_, value_idx m_) - : colors(colors_), m(m_){}; + FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){}; typedef typename cub::KeyValuePair KVP; - DI void operator()(value_idx rit, KVP *out, const KVP &other) { - if (rit < m && other.value < out->value && - colors[rit] != colors[other.key]) { - out->key = other.key; + DI void operator()(value_idx rit, KVP* out, const KVP& other) + { + if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) { + out->key = other.key; out->value = other.value; } } - DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) { + DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) + { if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) { return a; } else return b; } - DI void init(value_t *out, value_t maxVal) { *out = maxVal; } - DI void init(KVP *out, value_t maxVal) { - out->key = -1; + DI void init(value_t* out, value_t maxVal) { *out = maxVal; } + DI void init(KVP* out, value_t maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -119,7 +123,8 @@ struct FixConnectivitiesRedOp { */ struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -137,13 +142,9 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce @@ -158,13 +159,14 @@ struct CubKVPMinReduce { * @return total number of components */ template -value_idx get_n_components(value_idx *colors, size_t n_rows, +value_idx get_n_components(value_idx* colors, + size_t n_rows, std::shared_ptr d_alloc, - cudaStream_t stream) { - value_idx *map_ids; + cudaStream_t stream) +{ + value_idx* map_ids; int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, - d_alloc); + raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, d_alloc); d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); return num_clusters; @@ -177,11 +179,12 @@ value_idx get_n_components(value_idx *colors, size_t n_rows, */ template struct LookupColorOp { - value_idx *colors; + value_idx* colors; - LookupColorOp(value_idx *colors_) : colors(colors_) {} + LookupColorOp(value_idx* colors_) : colors(colors_) {} - DI value_idx operator()(const cub::KeyValuePair &kvp) { + DI value_idx operator()(const cub::KeyValuePair& kvp) + { return colors[kvp.key]; } }; @@ -191,7 +194,8 @@ struct LookupColorOp { * the given array of components * @tparam value_idx * @tparam value_t - * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components + * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given + * array of components * @param[out] nn_colors components of nearest neighbors for each vertex * @param[in] colors components of each vertex * @param[in] X original dense data @@ -201,25 +205,39 @@ struct LookupColorOp { * @param[in] stream cuda stream for which to order cuda operations */ template -void perform_1nn(cub::KeyValuePair *kvp, - value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, +void perform_1nn(cub::KeyValuePair* kvp, + value_idx* nn_colors, + value_idx* colors, + const value_t* X, + size_t n_rows, + size_t n_cols, std::shared_ptr d_alloc, - cudaStream_t stream, red_op reduction_op) { + cudaStream_t stream, + red_op reduction_op) +{ rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); - raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, - true, stream); - - raft::distance::fusedL2NN, - value_idx>( - kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols, - workspace.data(), reduction_op, reduction_op, true, true, stream); + raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); + + raft::distance::fusedL2NN, value_idx>( + kvp, + X, + X, + x_norm.data(), + x_norm.data(), + n_rows, + n_rows, + n_cols, + workspace.data(), + reduction_op, + reduction_op, + true, + true, + stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, - extract_colors_op); + thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } /** @@ -235,27 +253,33 @@ void perform_1nn(cub::KeyValuePair *kvp, * @param stream stream for which to order CUDA operations */ template -void sort_by_color(value_idx *colors, value_idx *nn_colors, - cub::KeyValuePair *kvp, - value_idx *src_indices, size_t n_rows, cudaStream_t stream) { +void sort_by_color(value_idx* colors, + value_idx* nn_colors, + cub::KeyValuePair* kvp, + value_idx* src_indices, + size_t n_rows, + cudaStream_t stream) +{ thrust::counting_iterator arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, - arg_sort_iter + n_rows, src_indices); + thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices); - auto keys = thrust::make_zip_iterator(thrust::make_tuple( - colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); + auto keys = thrust::make_zip_iterator( + thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair*)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, - TupleComp()); + thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, TupleComp()); } template -__global__ void min_components_by_color_kernel( - value_idx *out_rows, value_idx *out_cols, value_t *out_vals, - const value_idx *out_index, const value_idx *indices, - const cub::KeyValuePair *kvp, size_t nnz) { +__global__ void min_components_by_color_kernel(value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -284,19 +308,20 @@ __global__ void min_components_by_color_kernel( * @param[in] stream cuda stream for which to order cuda operations */ template -void min_components_by_color(raft::sparse::COO &coo, - const value_idx *out_index, - const value_idx *indices, - const cub::KeyValuePair *kvp, - size_t nnz, cudaStream_t stream) { +void min_components_by_color(raft::sparse::COO& coo, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz, + cudaStream_t stream) +{ /** * Arrays should be ordered by: colors_indptr->colors_n->kvp.value * so the last element of each column in the input CSR should be * the min. */ - min_components_by_color_kernel<<>>(coo.rows(), coo.cols(), coo.vals(), - out_index, indices, kvp, nnz); + min_components_by_color_kernel<<>>( + coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz); } /** @@ -318,14 +343,18 @@ void min_components_by_color(raft::sparse::COO &coo, * @param[in] n_cols number of cols in X */ template -void connect_components(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_t *X, const value_idx *orig_colors, - size_t n_rows, size_t n_cols, red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_components( + const raft::handle_t& handle, + raft::sparse::COO& out, + const value_t* X, + const value_idx* orig_colors, + size_t n_rows, + size_t n_cols, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Fixing connectivities for an unconnected k-NN graph only " @@ -335,47 +364,52 @@ void connect_components(const raft::handle_t &handle, raft::copy_async(colors.data(), orig_colors, n_rows, stream); // Normalize colors so they are drawn from a monotonically increasing set - raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - d_alloc, true); + raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, d_alloc, true); - value_idx n_components = - get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = get_n_components(colors.data(), n_rows, d_alloc, stream); /** * First compute 1-nn for all colors where the color of each data point * is guaranteed to be != color of its nearest neighbor. */ rmm::device_uvector nn_colors(n_rows, stream); - rmm::device_uvector> temp_inds_dists( - n_rows, stream); + rmm::device_uvector> temp_inds_dists(n_rows, stream); rmm::device_uvector src_indices(n_rows, stream); - perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, d_alloc, stream, reduction_op); + perform_1nn(temp_inds_dists.data(), + nn_colors.data(), + colors.data(), + X, + n_rows, + n_cols, + d_alloc, + stream, + reduction_op); /** * Sort data points by color (neighbors are not sorted) */ // max_color + 1 = number of connected components // sort nn_colors by key w/ original colors - sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(), - src_indices.data(), n_rows, stream); + sort_by_color( + colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream); /** * Take the min for any duplicate colors */ // Compute mask of duplicates rmm::device_uvector out_index(n_rows + 1, stream); - raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), - nn_colors.data(), n_rows, stream); + raft::sparse::op::compute_duplicates_mask( + out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), - out_index.data() + out_index.size(), out_index.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), + out_index.data(), + out_index.data() + out_index.size(), + out_index.data()); // compute final size value_idx size = 0; - raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, - stream); + raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); size++; @@ -383,14 +417,14 @@ void connect_components(const raft::handle_t &handle, raft::sparse::COO min_edges(d_alloc, stream); min_edges.allocate(size, n_rows, n_rows, true, stream); - min_components_by_color(min_edges, out_index.data(), src_indices.data(), - temp_inds_dists.data(), n_rows, stream); + min_components_by_color( + min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream); /** * Symmetrize resulting edge list */ - raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(), - min_edges.vals(), n_rows, n_rows, size, out); + raft::sparse::linalg::symmetrize( + handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out); } }; // end namespace linkage diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index 71fbb8ab3d..dbb24ee334 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -49,9 +49,11 @@ namespace selection { template struct csr_batcher_t { - csr_batcher_t(value_idx batch_size, value_idx n_rows, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data) + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) : batch_start_(0), batch_stop_(0), batch_rows_(0), @@ -61,32 +63,42 @@ struct csr_batcher_t { csr_indices_(csr_indices), csr_data_(csr_data), batch_csr_start_offset_(0), - batch_csr_stop_offset_(0) {} + batch_csr_stop_offset_(0) + { + } - void set_batch(int batch_num) { + void set_batch(int batch_num) + { batch_start_ = batch_num * batch_size_; - batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing - if (batch_stop_ >= total_rows_) - batch_stop_ = total_rows_ - 1; // zero-based indexing + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing batch_rows_ = (batch_stop_ - batch_start_) + 1; } - value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_indptr( - batch_start_, batch_stop_, csr_indptr_, batch_indptr, - &batch_csr_start_offset_, &batch_csr_stop_offset_, stream); + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); return batch_csr_stop_offset_ - batch_csr_start_offset_; } - void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_populate( - batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_, - csr_indices, csr_data, stream); + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); } value_idx batch_rows() const { return batch_rows_; } @@ -103,9 +115,9 @@ struct csr_batcher_t { value_idx total_rows_; - const value_idx *csr_indptr_; - const value_idx *csr_indices_; - const value_t *csr_data_; + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* csr_data_; value_idx batch_csr_start_offset_; value_idx batch_csr_stop_offset_; @@ -114,18 +126,26 @@ struct csr_batcher_t { template class sparse_knn_t { public: - sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_, - const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_, - int n_idx_cols_, const value_idx *queryIndptr_, - const value_idx *queryIndices_, const value_t *queryData_, - size_t queryNNZ_, int n_query_rows_, int n_query_cols_, - value_idx *output_indices_, value_t *output_dists_, int k_, - const raft::handle_t &handle_, - size_t batch_size_index_ = 2 << 14, // approx 1M - size_t batch_size_query_ = 2 << 14, - raft::distance::DistanceType metric_ = - raft::distance::DistanceType::L2Expanded, - float metricArg_ = 0) + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + const raft::handle_t& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded, + float metricArg_ = 0) : idxIndptr(idxIndptr_), idxIndices(idxIndices_), idxData(idxData_), @@ -145,9 +165,12 @@ class sparse_knn_t { batch_size_index(batch_size_index_), batch_size_query(batch_size_query_), metric(metric_), - metricArg(metricArg_) {} + metricArg(metricArg_) + { + } - void run() { + void run() + { using namespace raft::sparse; int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); @@ -158,37 +181,33 @@ class sparse_knn_t { for (int i = 0; i < n_batches_query; i++) { /** - * Compute index batch info - */ + * Compute index batch info + */ query_batcher.set_batch(i); /** - * Slice CSR to rows in batch - */ + * Slice CSR to rows in batch + */ - rmm::device_uvector query_batch_indptr( - query_batcher.batch_rows() + 1, handle.get_stream()); + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + handle.get_stream()); - value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( - query_batch_indptr.data(), handle.get_stream()); + value_idx n_query_batch_nnz = + query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream()); - rmm::device_uvector query_batch_indices(n_query_batch_nnz, - handle.get_stream()); - rmm::device_uvector query_batch_data(n_query_batch_nnz, - handle.get_stream()); + rmm::device_uvector query_batch_indices(n_query_batch_nnz, handle.get_stream()); + rmm::device_uvector query_batch_data(n_query_batch_nnz, handle.get_stream()); - query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), - query_batch_data.data(), - handle.get_stream()); + query_batcher.get_batch_csr_indices_data( + query_batch_indices.data(), query_batch_data.data(), handle.get_stream()); // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent // batches and 1 space for the results of the merge, which get copied back to the top - rmm::device_uvector merge_buffer_indices(0, - handle.get_stream()); + rmm::device_uvector merge_buffer_indices(0, handle.get_stream()); rmm::device_uvector merge_buffer_dists(0, handle.get_stream()); - value_t *dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_ptr; + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); csr_batcher_t idx_batcher( @@ -197,22 +216,19 @@ class sparse_knn_t { for (int j = 0; j < n_batches_idx; j++) { idx_batcher.set_batch(j); - merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); - merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); /** - * Slice CSR to rows in batch - */ - rmm::device_uvector idx_batch_indptr( - idx_batcher.batch_rows() + 1, handle.get_stream()); - rmm::device_uvector idx_batch_indices(0, - handle.get_stream()); + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + handle.get_stream()); + rmm::device_uvector idx_batch_indices(0, handle.get_stream()); rmm::device_uvector idx_batch_data(0, handle.get_stream()); - value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( - idx_batch_indptr.data(), handle.get_stream()); + value_idx idx_batch_nnz = + idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream()); idx_batch_indices.resize(idx_batch_nnz, handle.get_stream()); idx_batch_data.resize(idx_batch_nnz, handle.get_stream()); @@ -221,111 +237,126 @@ class sparse_knn_t { idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream()); /** - * Compute distances - */ - size_t dense_size = - idx_batcher.batch_rows() * query_batcher.batch_rows(); - rmm::device_uvector batch_dists(dense_size, - handle.get_stream()); - - CUDA_CHECK(cudaMemset(batch_dists.data(), 0, - batch_dists.size() * sizeof(value_t))); - - compute_distances(idx_batcher, query_batcher, idx_batch_nnz, - n_query_batch_nnz, idx_batch_indptr.data(), - idx_batch_indices.data(), idx_batch_data.data(), - query_batch_indptr.data(), query_batch_indices.data(), - query_batch_data.data(), batch_dists.data()); + * Compute distances + */ + size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, handle.get_stream()); + + CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); // Build batch indices array - rmm::device_uvector batch_indices(batch_dists.size(), - handle.get_stream()); + rmm::device_uvector batch_indices(batch_dists.size(), handle.get_stream()); // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); - iota_fill(batch_indices.data(), batch_rows, batch_cols, - handle.get_stream()); + iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream()); /** * Perform k-selection on batch & merge with other k-selections */ size_t merge_buffer_offset = batch_rows * k; - dists_merge_buffer_ptr = - merge_buffer_dists.data() + merge_buffer_offset; - indices_merge_buffer_ptr = - merge_buffer_indices.data() + merge_buffer_offset; - - perform_k_selection(idx_batcher, query_batcher, batch_dists.data(), - batch_indices.data(), dists_merge_buffer_ptr, + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, indices_merge_buffer_ptr); - value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; // Merge results of difference batches if necessary if (idx_batcher.batch_start() > 0) { - size_t merge_buffer_tmp_out = batch_rows * k * 2; - dists_merge_buffer_tmp_ptr = - merge_buffer_dists.data() + merge_buffer_tmp_out; - indices_merge_buffer_tmp_ptr = - merge_buffer_indices.data() + merge_buffer_tmp_out; - - merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(), - merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr, + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + dists_merge_buffer_tmp_ptr, indices_merge_buffer_tmp_ptr); } // copy merged output back into merge buffer partition for next iteration raft::copy_async(merge_buffer_indices.data(), indices_merge_buffer_tmp_ptr, - batch_rows * k, handle.get_stream()); + batch_rows * k, + handle.get_stream()); raft::copy_async(merge_buffer_dists.data(), - dists_merge_buffer_tmp_ptr, batch_rows * k, + dists_merge_buffer_tmp_ptr, + batch_rows * k, handle.get_stream()); } // Copy final merged batch to output array - raft::copy_async( - output_indices + (rows_processed * k), merge_buffer_indices.data(), - query_batcher.batch_rows() * k, handle.get_stream()); - raft::copy_async( - output_dists + (rows_processed * k), merge_buffer_dists.data(), - query_batcher.batch_rows() * k, handle.get_stream()); + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); rows_processed += query_batcher.batch_rows(); } } private: - void merge_batches(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - value_t *merge_buffer_dists, - value_idx *merge_buffer_indices, value_t *out_dists, - value_idx *out_indices) { + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { // build translation buffer to shift resulting indices by the batch std::vector id_ranges; id_ranges.push_back(0); id_ranges.push_back(idx_batcher.batch_start()); rmm::device_uvector trans(id_ranges.size(), handle.get_stream()); - raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), - handle.get_stream()); + raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream()); // combine merge buffers only if there's more than 1 partition to combine - raft::spatial::knn::knn_merge_parts( - merge_buffer_dists, merge_buffer_indices, out_dists, out_indices, - query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data()); + raft::spatial::knn::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + handle.get_stream(), + trans.data()); } void perform_k_selection(csr_batcher_t idx_batcher, csr_batcher_t query_batcher, - value_t *batch_dists, value_idx *batch_indices, - value_t *out_dists, value_idx *out_indices) { + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); // build translation buffer to shift resulting indices by the batch std::vector id_ranges; @@ -340,51 +371,60 @@ class sparse_knn_t { if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; // kernel to slice first (min) k cols and copy into batched merge buffer - select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists, - out_indices, ascending, n_neighbors, handle.get_stream()); + select_k(batch_dists, + batch_indices, + batch_rows, + batch_cols, + out_dists, + out_indices, + ascending, + n_neighbors, + handle.get_stream()); } - void compute_distances(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - size_t idx_batch_nnz, size_t query_batch_nnz, - value_idx *idx_batch_indptr, - value_idx *idx_batch_indices, value_t *idx_batch_data, - value_idx *query_batch_indptr, - value_idx *query_batch_indices, - value_t *query_batch_data, value_t *batch_dists) { + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { /** * Compute distances */ - raft::sparse::distance::distances_config_t dist_config( - handle); + raft::sparse::distance::distances_config_t dist_config(handle); dist_config.b_nrows = idx_batcher.batch_rows(); dist_config.b_ncols = n_idx_cols; - dist_config.b_nnz = idx_batch_nnz; + dist_config.b_nnz = idx_batch_nnz; - dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indptr = idx_batch_indptr; dist_config.b_indices = idx_batch_indices; - dist_config.b_data = idx_batch_data; + dist_config.b_data = idx_batch_data; dist_config.a_nrows = query_batcher.batch_rows(); dist_config.a_ncols = n_query_cols; - dist_config.a_nnz = query_batch_nnz; + dist_config.a_nnz = query_batch_nnz; - dist_config.a_indptr = query_batch_indptr; + dist_config.a_indptr = query_batch_indptr; dist_config.a_indices = query_batch_indices; - dist_config.a_data = query_batch_data; + dist_config.a_data = query_batch_data; if (raft::sparse::distance::supportedDistance.find(metric) == raft::sparse::distance::supportedDistance.end()) THROW("DistanceType not supported: %d", metric); - raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, - metricArg); + raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); } const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; - value_idx *output_indices; + value_idx* output_indices; const value_t *idxData, *queryData; - value_t *output_dists; + value_t* output_dists; size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; @@ -394,52 +434,76 @@ class sparse_knn_t { int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; - const raft::handle_t &handle; + const raft::handle_t& handle; }; /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNA number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] cusparseHandle the initialized cusparseHandle instance to use - * @param[in] allocator device allocator instance to use - * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) - */ + * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors + * using some distance implementation + * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) + * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) + * @param[in] idxData csr data array of the index matrix (size idxNNZ) + * @param[in] idxNNA number of non-zeros for sparse index matrix + * @param[in] n_idx_rows number of data samples in index matrix + * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) + * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) + * @param[in] queryData csr data array of the query matrix (size queryNNZ) + * @param[in] queryNNZ number of non-zeros for sparse query matrix + * @param[in] n_query_rows number of data samples in query matrix + * @param[in] n_query_cols number of features in query matrix + * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) + * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) + * @param[in] k the number of neighbors to query + * @param[in] cusparseHandle the initialized cusparseHandle instance to use + * @param[in] allocator device allocator instance to use + * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to + * @param[in] batch_size_index maximum number of rows to use from index matrix per batch + * @param[in] batch_size_query maximum number of rows to use from query matrix per batch + * @param[in] metric distance metric/measure to use + * @param[in] metricArg potential argument for metric (currently unused) + */ template -void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, - const value_t *idxData, size_t idxNNZ, int n_idx_rows, - int n_idx_cols, const value_idx *queryIndptr, - const value_idx *queryIndices, const value_t *queryData, - size_t queryNNZ, int n_query_rows, int n_query_cols, - value_idx *output_indices, value_t *output_dists, int k, - const raft::handle_t &handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - sparse_knn_t( - idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, - queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, - output_indices, output_dists, k, handle, batch_size_index, batch_size_query, - metric, metricArg) +void brute_force_knn(const value_idx* idxIndptr, + const value_idx* idxIndices, + const value_t* idxData, + size_t idxNNZ, + int n_idx_rows, + int n_idx_cols, + const value_idx* queryIndptr, + const value_idx* queryIndices, + const value_t* queryData, + size_t queryNNZ, + int n_query_rows, + int n_query_cols, + value_idx* output_indices, + value_t* output_dists, + int k, + const raft::handle_t& handle, + size_t batch_size_index = 2 << 14, // approx 1M + size_t batch_size_query = 2 << 14, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + sparse_knn_t(idxIndptr, + idxIndices, + idxData, + idxNNZ, + n_idx_rows, + n_idx_cols, + queryIndptr, + queryIndices, + queryData, + queryNNZ, + n_query_rows, + n_query_cols, + output_indices, + output_dists, + k, + handle, + batch_size_index, + batch_size_query, + metric, + metricArg) .run(); } diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 1cf225087a..1308f5ce02 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -45,31 +45,34 @@ namespace selection { * @param m */ template -__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) { +__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz) +{ value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; - value_idx v = tid / m; + value_idx v = tid / m; indices[tid] = v; } template -value_idx build_k(value_idx n_samples, int c) { +value_idx build_k(value_idx n_samples, int c) +{ // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering // approach on GPU" - return min(n_samples, - max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); + return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); } template -__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) { +__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; - out_t v = inds[tid]; + out_t v = inds[tid]; out[tid] = v; } template -void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { +void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream) +{ size_t blocks = ceildiv(size, (size_t)tpb); conv_indices_kernel<<>>(inds, out, size); } @@ -91,13 +94,18 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { * @param c */ template -void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, +void knn_graph(const handle_t& handle, + const value_t* X, + size_t m, + size_t n, distance::DistanceType metric, - raft::sparse::COO &out, int c = 15) { + raft::sparse::COO& out, + int c = 15) +{ int k = build_k(m, c); auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); size_t nnz = m * k; @@ -108,8 +116,8 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, size_t blocks = ceildiv(nnz, (size_t)256); fill_indices<<>>(rows.data(), k, nnz); - std::vector inputs; - inputs.push_back(const_cast(X)); + std::vector inputs; + inputs.push_back(const_cast(X)); std::vector sizes; sizes.push_back(m); @@ -119,15 +127,25 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, rmm::device_uvector int64_indices(nnz, stream); uint32_t knn_start = curTimeMillis(); - raft::spatial::knn::brute_force_knn( - handle, inputs, sizes, n, const_cast(X), m, int64_indices.data(), - data.data(), k, true, true, nullptr, metric); + raft::spatial::knn::brute_force_knn(handle, + inputs, + sizes, + n, + const_cast(X), + m, + int64_indices.data(), + data.data(), + k, + true, + true, + nullptr, + metric); // convert from current knn's 64-bit to 32-bit. conv_indices(int64_indices.data(), indices.data(), nnz, stream); - raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(), - data.data(), m, k, nnz, out); + raft::sparse::linalg::symmetrize( + handle, rows.data(), indices.data(), data.data(), m, k, nnz, out); } }; // namespace selection diff --git a/cpp/include/raft/sparse/selection/selection.cuh b/cpp/include/raft/sparse/selection/selection.cuh index 6066a36289..190e06b2cd 100644 --- a/cpp/include/raft/sparse/selection/selection.cuh +++ b/cpp/include/raft/sparse/selection/selection.cuh @@ -39,27 +39,33 @@ namespace raft { namespace sparse { namespace selection { -template -__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, - size_t n_cols, K *outK, IndexType *outV, - K initK, IndexType initV, int k) { +template +__global__ void select_k_kernel(K* inK, + IndexType* inV, + size_t n_rows, + size_t n_cols, + K* outK, + IndexType* outV, + K initK, + IndexType initV, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ K smemK[kNumWarps * warp_q]; __shared__ IndexType smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, - warp_q, thread_q, tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available int row = blockIdx.x; - int i = threadIdx.x; + int i = threadIdx.x; - int idx = row * n_cols; - K *inKStart = inK + idx + i; - IndexType *inVStart = inV + idx + i; + int idx = row * n_cols; + K* inKStart = inK + idx + i; + IndexType* inVStart = inV + idx + i; // Whole warps must participate in the selection int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize); @@ -86,27 +92,31 @@ __global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, } } -template -inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, - size_t n_cols, value_t *outK, value_idx *outV, - bool select_min, int k, cudaStream_t stream) { +template +inline void select_k_impl(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ auto grid = dim3(n_rows); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); - auto kInit = select_min ? faiss::gpu::Limits::getMax() - : faiss::gpu::Limits::getMin(); + auto kInit = + select_min ? faiss::gpu::Limits::getMax() : faiss::gpu::Limits::getMin(); auto vInit = -1; if (select_min) { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } else { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } CUDA_CHECK(cudaGetLastError()); } @@ -126,30 +136,37 @@ inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, * @param[in] stream CUDA stream to use */ template -inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, - value_t *outK, value_idx *outV, bool select_min, int k, - cudaStream_t stream) { +inline void select_k(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ if (k == 1) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 32) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 64) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 128) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 256) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 512) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 1024) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); } }; // namespace selection diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h index 63578bf1f3..56e8832e0a 100644 --- a/cpp/include/raft/sparse/utils.h +++ b/cpp/include/raft/sparse/utils.h @@ -26,7 +26,8 @@ namespace sparse { * @param[in] ncols number of blocks to quantize */ template -inline int block_dim(value_idx ncols) { +inline int block_dim(value_idx ncols) +{ int blockdim; if (ncols <= 32) blockdim = 32; @@ -54,9 +55,9 @@ inline int block_dim(value_idx ncols) { * @return */ template -__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, - G key) { - unsigned int mask = __ballot_sync(init_mask, true); +__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key) +{ + unsigned int mask = __ballot_sync(init_mask, true); unsigned int peer_group = 0; bool is_peer; @@ -77,12 +78,14 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, } #endif -__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ return __ffs(peer_group) - 1; } template -__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { +__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols) +{ int row = blockIdx.x; int tid = threadIdx.x; @@ -92,15 +95,16 @@ __global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { } template -void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols, - cudaStream_t stream) { +void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream) +{ int blockdim = block_dim(ncols); iota_fill_block_kernel<<>>(indices, ncols); } template -__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) { +__device__ int get_stop_idx(T row, T m, T nnz, const T* ind) +{ int stop_idx = 0; if (row < (m - 1)) stop_idx = ind[row + 1]; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 77d7831b4a..f77a56164d 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -45,14 +45,16 @@ using deviceAllocator = raft::mr::device::allocator; * @param[in] D the dimensionality of the index array */ template -inline void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - knnIndexParam *params, +inline void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, - value_idx n, value_idx D) { - detail::approx_knn_build_index(handle, index, params, metric, metricArg, - index_array, n, D); + float metricArg, + float* index_array, + value_idx n, + value_idx D) +{ + detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); } /** @@ -69,12 +71,15 @@ inline void approx_knn_build_index(raft::handle_t &handle, * @param[in] n number of rows in the query array */ template -inline void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, - raft::spatial::knn::knnIndex *index, value_idx k, - float *query_array, value_idx n) { - detail::approx_knn_search(handle, distances, indices, index, k, query_array, - n); +inline void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + value_idx k, + float* query_array, + value_idx n) +{ + detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); } } // namespace knn diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 6a6c7751c2..573a23181d 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -26,13 +26,14 @@ namespace spatial { namespace knn { struct knnIndex { - faiss::gpu::GpuIndex *index; + faiss::gpu::GpuIndex* index; raft::distance::DistanceType metric; float metricArg; - faiss::gpu::StandardGpuResources *gpu_res; + faiss::gpu::StandardGpuResources* gpu_res; int device; - ~knnIndex() { + ~knnIndex() + { delete index; delete gpu_res; } @@ -57,7 +58,8 @@ struct IVFParam : knnIndexParam { int nprobe; }; -struct IVFFlatParam : IVFParam {}; +struct IVFFlatParam : IVFParam { +}; struct IVFPQParam : IVFParam { int M; diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6e4c99b646..7eb439c78b 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -56,115 +56,107 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype( - QuantizerType qtype) { +inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) +{ switch (qtype) { - case QuantizerType::QT_8bit: - return faiss::ScalarQuantizer::QuantizerType::QT_8bit; + case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; case QuantizerType::QT_8bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; case QuantizerType::QT_4bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; - case QuantizerType::QT_fp16: - return faiss::ScalarQuantizer::QuantizerType::QT_fp16; + case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; case QuantizerType::QT_8bit_direct: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; - case QuantizerType::QT_6bit: - return faiss::ScalarQuantizer::QuantizerType::QT_6bit; - default: - return (faiss::ScalarQuantizer::QuantizerType)qtype; + case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; + default: return (faiss::ScalarQuantizer::QuantizerType)qtype; } } template -void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfflat_build_index( + knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = index->device; + config.device = index->device; faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat( - index->gpu_res, D, params->nlist, faiss_metric, config); + faiss::gpu::GpuIndexIVFFlat* faiss_index = + new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfpq_build_index( + knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFPQConfig config; - config.device = index->device; - config.usePrecomputedTables = params->usePrecomputedTables; - config.interleavedLayout = params->n_bits != 8; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFPQ *faiss_index = - new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M, - params->n_bits, faiss_metric, config); + config.device = index->device; + config.usePrecomputedTables = params->usePrecomputedTables; + config.interleavedLayout = params->n_bits != 8; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ( + index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfsq_build_index( + knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; - config.device = index->device; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::ScalarQuantizer::QuantizerType faiss_qtype = - build_faiss_qtype(params->qtype); - faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index = - new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist, - faiss_qtype, faiss_metric, - params->encodeResidual); + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params->qtype); + faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer( + index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - raft::spatial::knn::knnIndexParam *params, +void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + raft::spatial::knn::knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, IntType n, - IntType D) { + float metricArg, + float* index_array, + IntType n, + IntType D) +{ int device; CUDA_CHECK(cudaGetDevice(&device)); - faiss::gpu::StandardGpuResources *gpu_res = - new faiss::gpu::StandardGpuResources(); + faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources(); gpu_res->noTempMemory(); gpu_res->setDefaultStream(device, handle.get_stream()); - index->gpu_res = gpu_res; - index->device = device; - index->index = nullptr; - index->metric = metric; + index->gpu_res = gpu_res; + index->device = device; + index->index = nullptr; + index->metric = metric; index->metricArg = metricArg; // perform preprocessing // k set to 0 (unused during preprocessing / revertion) - std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, 0, false, handle.get_stream(), - handle.get_device_allocator()); + std::unique_ptr> query_metric_processor = create_processor( + metric, n, D, 0, false, handle.get_stream(), handle.get_device_allocator()); query_metric_processor->preprocess(index_array); - if (dynamic_cast(params)) { - IVFFlatParam *IVFFlat_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFFlatParam* IVFFlat_param = dynamic_cast(params); approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D); std::vector h_index_array(n * D); - raft::update_host(h_index_array.data(), index_array, h_index_array.size(), - handle.get_stream()); + raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream()); query_metric_processor->revert(index_array); index->index->train(n, h_index_array.data()); index->index->add(n, h_index_array.data()); } else { - if (dynamic_cast(params)) { - IVFPQParam *IVFPQ_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFPQParam* IVFPQ_param = dynamic_cast(params); approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D); - } else if (dynamic_cast(params)) { - IVFSQParam *IVFSQ_param = dynamic_cast(params); + } else if (dynamic_cast(params)) { + IVFSQParam* IVFSQ_param = dynamic_cast(params); approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D); } else { ASSERT(index->index, "KNN index could not be initialized"); @@ -177,13 +169,23 @@ void approx_knn_build_index(raft::handle_t &handle, } template -void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, raft::spatial::knn::knnIndex *index, - IntType k, float *query_array, IntType n) { +void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + IntType k, + float* query_array, + IntType n) +{ // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream(), handle.get_device_allocator()); + create_processor(index->metric, + n, + index->index->d, + k, + false, + handle.get_stream(), + handle.get_device_allocator()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); @@ -194,13 +196,14 @@ void approx_knn_search(raft::handle_t &handle, float *distances, index->metric == raft::distance::DistanceType::L2SqrtUnexpanded || index->metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (index->metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / index->metricArg; + if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg; raft::linalg::unaryOp( - distances, distances, n * k, + distances, + distances, + n * k, [p] __device__(float input) { return powf(input, p); }, handle.get_stream()); } diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 0c0398a336..5618186dfc 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -27,37 +27,26 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::MetricType build_faiss_metric( - raft::distance::DistanceType metric) { +inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric) +{ switch (metric) { case raft::distance::DistanceType::CosineExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; case raft::distance::DistanceType::CorrelationExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::L2Expanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2Unexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtExpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtUnexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L1: - return faiss::MetricType::METRIC_L1; - case raft::distance::DistanceType::InnerProduct: - return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::LpUnexpanded: - return faiss::MetricType::METRIC_Lp; - case raft::distance::DistanceType::Linf: - return faiss::MetricType::METRIC_Linf; - case raft::distance::DistanceType::Canberra: - return faiss::MetricType::METRIC_Canberra; - case raft::distance::DistanceType::BrayCurtis: - return faiss::MetricType::METRIC_BrayCurtis; + case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1; + case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT; + case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp; + case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf; + case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra; + case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis; case raft::distance::DistanceType::JensenShannon: return faiss::MetricType::METRIC_JensenShannon; - default: - THROW("MetricType not supported: %d", metric); + default: THROW("MetricType not supported: %d", metric); } } diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 7d87254cb6..049c11514c 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -35,7 +35,8 @@ namespace knn { namespace detail { template -DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { +DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) +{ value_t sin_0 = sin(0.5 * (x1 - y1)); value_t sin_1 = sin(0.5 * (x2 - y2)); value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1; @@ -56,34 +57,36 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { * @param[in] n_index_rows number of rows in index array * @param[in] k number of closest neighbors to return */ -template -__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, int k) { +template +__global__ void haversine_knn_kernel(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); // Grid is exactly sized to rows available int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize); - const value_t *query_ptr = query + (blockIdx.x * 2); - value_t x1 = query_ptr[0]; - value_t x2 = query_ptr[1]; + const value_t* query_ptr = query + (blockIdx.x * 2); + value_t x1 = query_ptr[0]; + value_t x2 = query_ptr[1]; int i = threadIdx.x; for (; i < limit; i += tpb) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -92,9 +95,9 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, // Handle last remainder fraction of a warp of elements if (i < n_index_rows) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -105,7 +108,7 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, for (int i = threadIdx.x; i < k; i += tpb) { out_dists[blockIdx.x * k + i] = smemK[i]; - out_inds[blockIdx.x * k + i] = smemV[i]; + out_inds[blockIdx.x * k + i] = smemV[i]; } } @@ -126,10 +129,15 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, * @param[in] stream stream to order kernel launch */ template -void haversine_knn(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, size_t n_query_rows, int k, - cudaStream_t stream) { +void haversine_knn(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + size_t n_query_rows, + int k, + cudaStream_t stream) +{ haversine_knn_kernel<<>>( out_inds, out_dists, index, query, n_index_rows, k); } diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 09494e9eb1..a276ae45ad 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -43,13 +43,18 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, - value_t *outK, value_idx *outV, - size_t n_samples, int n_parts, - value_t initK, value_idx initV, int k, - value_idx *translations) { +template +__global__ void knn_merge_parts_kernel(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + value_t initK, + value_idx initV, + int k, + value_idx* translations) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; @@ -58,34 +63,33 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, /** * Uses shared memory */ - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available - int row = blockIdx.x; + int row = blockIdx.x; int total_k = k * n_parts; int i = threadIdx.x; // Get starting pointers for cols in current thread - int part = i / k; + int part = i / k; size_t row_idx = (row * k) + (part * n_samples * k); int col = i % k; - value_t *inKStart = inK + (row_idx + col); - value_idx *inVStart = inV + (row_idx + col); + value_t* inKStart = inK + (row_idx + col); + value_idx* inVStart = inV + (row_idx + col); - int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); value_idx translation = 0; for (; i < limit; i += tpb) { translation = translations[part]; heap.add(*inKStart, (*inVStart) + translation); - part = (i + tpb) / k; + part = (i + tpb) / k; row_idx = (row * k) + (part * n_samples * k); col = (i + tpb) % k; @@ -108,22 +112,27 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, } } -template -inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +template +inline void knn_merge_parts_impl(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ auto grid = dim3(n_samples); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); auto kInit = faiss::gpu::Limits::getMax(); auto vInit = -1; knn_merge_parts_kernel - <<>>(inK, inV, outK, outV, n_samples, n_parts, - kInit, vInit, k, translations); + <<>>( + inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations); CUDA_CHECK(cudaPeekAtLastError()); } @@ -142,10 +151,16 @@ inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, * @param translations mapping of index offsets for each partition */ template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ if (k == 1) knn_merge_parts_impl( inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); @@ -195,27 +210,33 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm */ template -void brute_force_knn_impl(std::vector &input, std::vector &sizes, - IntType D, float *search_items, IntType n, - int64_t *res_I, float *res_D, IntType k, - std::shared_ptr allocator, - cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors should be the same size"); - - std::vector *id_ranges; +void brute_force_knn_impl( + std::vector& input, + std::vector& sizes, + IntType D, + float* search_items, + IntType n, + int64_t* res_I, + float* res_D, + IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, + cudaStream_t* internalStreams = nullptr, + int n_int_streams = 0, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); + + std::vector* id_ranges; if (translations == nullptr) { // If we don't have explicit translations // for offsets of the indices, build them // from the local partitions - id_ranges = new std::vector(); + id_ranges = new std::vector(); int64_t total_n = 0; for (size_t i = 0; i < input.size(); i++) { id_ranges->push_back(total_n); @@ -228,31 +249,27 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream, allocator); query_metric_processor->preprocess(search_items); - std::vector>> metric_processors( - input.size()); + std::vector>> metric_processors(input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = + create_processor(metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); metric_processors[i]->preprocess(input[i]); } int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, - id_ranges->size()); - raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), - userStream); + raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); raft::mr::device::buffer all_D(allocator, userStream, 0); raft::mr::device::buffer all_I(allocator, userStream, 0); - float *out_D = res_D; - int64_t *out_I = res_I; + float* out_D = res_D; + int64_t* out_I = res_I; if (input.size() > 1) { all_D.resize(input.size() * k * n, userStream); @@ -266,11 +283,10 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); for (size_t i = 0; i < input.size(); i++) { - float *out_d_ptr = out_D + (i * k * n); - int64_t *out_i_ptr = out_I + (i * k * n); + float* out_d_ptr = out_D + (i * k * n); + int64_t* out_i_ptr = out_I + (i * k * n); - cudaStream_t stream = - raft::select_stream(userStream, internalStreams, n_int_streams, i); + cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); switch (metric) { case raft::distance::DistanceType::Haversine: @@ -279,8 +295,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, "Haversine distance requires 2 dimensions " "(latitude / longitude)."); - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, - k, stream); + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); break; default: faiss::MetricType m = build_faiss_metric(metric); @@ -291,18 +306,18 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, gpu_res.setDefaultStream(device, stream); faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; + args.numVectors = sizes[i]; + args.queries = search_items; args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; /** * @todo: Until FAISS supports pluggable allocation strategies, @@ -325,8 +340,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. - knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); + knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data()); } // Perform necessary post-processing @@ -334,14 +348,12 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / metricArg; + if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg; raft::linalg::unaryOp( - res_D, res_D, n * k, - [p] __device__(float input) { return powf(input, p); }, userStream); + res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream); } query_metric_processor->revert(search_items); diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index a645412c2f..6e983d1f42 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -39,11 +39,11 @@ using deviceAllocator = raft::mr::device::allocator; template class MetricProcessor { public: - virtual void preprocess(math_t *data) {} + virtual void preprocess(math_t* data) {} - virtual void revert(math_t *data) {} + virtual void revert(math_t* data) {} - virtual void postprocess(math_t *data) {} + virtual void postprocess(math_t* data) {} virtual ~MetricProcessor() = default; }; @@ -60,7 +60,10 @@ class CosineMetricProcessor : public MetricProcessor { raft::mr::device::buffer colsums_; public: - CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, + CosineMetricProcessor(size_t n_rows, + size_t n_cols, + int k, + bool row_major, cudaStream_t stream, std::shared_ptr allocator) : device_allocator_(allocator), @@ -69,30 +72,51 @@ class CosineMetricProcessor : public MetricProcessor { n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), - k_(k) {} + k_(k) + { + } - void preprocess(math_t *data) { - raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_, - raft::linalg::NormType::L2Norm, row_major_, stream_, + void preprocess(math_t* data) + { + raft::linalg::rowNorm(colsums_.data(), + data, + n_cols_, + n_rows_, + raft::linalg::NormType::L2Norm, + row_major_, + stream_, [] __device__(math_t in) { return sqrtf(in); }); raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; }, stream_); } - void revert(math_t *data) { + void revert(math_t* data) + { raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; }, stream_); } - void postprocess(math_t *data) { + void postprocess(math_t* data) + { raft::linalg::unaryOp( - data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, - stream_); + data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_); } ~CosineMetricProcessor() = default; @@ -103,43 +127,64 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { using cosine = CosineMetricProcessor; public: - CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream, + CorrelationMetricProcessor(size_t n_rows, + size_t n_cols, + int k, + bool row_major, + cudaStream_t stream, std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, - allocator), - means_(allocator, stream, n_rows) {} + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, allocator), + means_(allocator, stream, n_rows) + { + } - void preprocess(math_t *data) { + void preprocess(math_t* data) + { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; - raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_, - (math_t)0.0, cosine::row_major_, true, + raft::linalg::reduce(means_.data(), + data, + cosine::n_cols_, + cosine::n_rows_, + (math_t)0.0, + cosine::row_major_, + true, cosine::stream_); raft::linalg::unaryOp( - means_.data(), means_.data(), cosine::n_rows_, + means_.data(), + means_.data(), + cosine::n_rows_, [=] __device__(math_t in) { return in * normalizer_const; }, cosine::stream_); - raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanCenter(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); CosineMetricProcessor::preprocess(data); } - void revert(math_t *data) { + void revert(math_t* data) + { CosineMetricProcessor::revert(data); - raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanAdd(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); } - void postprocess(math_t *data) { - CosineMetricProcessor::postprocess(data); - } + void postprocess(math_t* data) { CosineMetricProcessor::postprocess(data); } ~CorrelationMetricProcessor() = default; @@ -149,33 +194,36 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { template class DefaultMetricProcessor : public MetricProcessor { public: - void preprocess(math_t *data) {} + void preprocess(math_t* data) {} - void revert(math_t *data) {} + void revert(math_t* data) {} - void postprocess(math_t *data) {} + void postprocess(math_t* data) {} ~DefaultMetricProcessor() = default; }; template inline std::unique_ptr> create_processor( - distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream, std::shared_ptr allocator) { - MetricProcessor *mp = nullptr; + distance::DistanceType metric, + int n, + int D, + int k, + bool rowMajorQuery, + cudaStream_t userStream, + std::shared_ptr allocator) +{ + MetricProcessor* mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, - allocator); + mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; case distance::DistanceType::CorrelationExpanded: - mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream, allocator); + mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; - default: - mp = new DefaultMetricProcessor(); + default: mp = new DefaultMetricProcessor(); } return std::unique_ptr>(mp); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a3a1972c13..42ee11ba5b 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -28,12 +28,17 @@ namespace knn { using deviceAllocator = raft::mr::device::allocator; template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { - detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, - translations); +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ + detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); } /** @@ -59,23 +64,42 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). */ -inline void brute_force_knn( - raft::handle_t const &handle, std::vector &input, - std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, - float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true, - std::vector *translations = nullptr, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded, - float metric_arg = 2.0f) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors must be the same size"); +inline void brute_force_knn(raft::handle_t const& handle, + std::vector& input, + std::vector& sizes, + int D, + float* search_items, + int n, + int64_t* res_I, + float* res_D, + int k, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + float metric_arg = 2.0f) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); std::vector int_streams = handle.get_internal_streams(); - detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_device_allocator(), - handle.get_stream(), int_streams.data(), - handle.get_num_internal_streams(), rowMajorIndex, - rowMajorQuery, translations, metric, metric_arg); + detail::brute_force_knn_impl(input, + sizes, + D, + search_items, + n, + res_I, + res_D, + k, + handle.get_device_allocator(), + handle.get_stream(), + int_streams.data(), + handle.get_num_internal_streams(), + rowMajorIndex, + rowMajorQuery, + translations, + metric, + metric_arg); } } // namespace knn diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 922ae7cfab..7032a0009e 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -24,8 +24,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct cluster_solver_config_t { size_type_t n_clusters; size_type_t maxIter; @@ -35,25 +34,37 @@ struct cluster_solver_config_t { unsigned long long seed{123456}; }; -template +template struct kmeans_solver_t { - explicit kmeans_solver_t(cluster_solver_config_t const& config) - : config_(config) {} + explicit kmeans_solver_t( + cluster_solver_config_t const& config) + : config_(config) + { + } template - std::pair solve( - handle_t const& handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const { + std::pair solve(handle_t const& handle, + thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, + kmeans(handle, + t_exe_policy, + n_obs_vecs, + dim, + config_.n_clusters, + config_.tol, + config_.maxIter, + obs, + codes, + residual, + iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index e36dca2e0c..156b996586 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -23,8 +23,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct eigen_solver_config_t { size_type_t n_eigVecs; size_type_t maxIter; @@ -34,42 +33,59 @@ struct eigen_solver_config_t { bool reorthogonalize{false}; unsigned long long seed{ - 1234567}; // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations }; -template +template struct lanczos_solver_t { - explicit lanczos_solver_t(eigen_solver_config_t const& config) - : config_(config) {} + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } - index_type_t solve_smallest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_smallest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, - eigVecs, config_.seed); + computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); return iters; } - index_type_t solve_largest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_largest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, eigVecs, + computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, config_.seed); return iters; } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index fb05bff3e2..e0c3565b77 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -44,15 +44,15 @@ using namespace raft::linalg; // Useful grid settings // ========================================================= -constexpr unsigned int BLOCK_SIZE = 1024; -constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels // ========================================================= -/** +/** * @brief Compute distances between observation vectors and centroids * Block dimensions should be (warpSize, 1, * blockSize/warpSize). Ideally, the grid is large enough so there @@ -76,11 +76,13 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances( - index_type_t n, index_type_t d, index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) { +static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) +{ // Loop index index_type_t i; @@ -115,12 +117,10 @@ static __global__ void computeDistances( // Perform reduction on warp for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += - __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); // Write result to global memory - if (threadIdx.x == 0) - atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -135,8 +135,8 @@ static __global__ void computeDistances( } } -/** - * @brief Find closest centroid to observation vectors. +/** + * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -157,10 +157,12 @@ static __global__ void computeDistances( * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, index_type_t k, +static __global__ void minDistances(index_type_t n, + index_type_t k, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { + index_type_t* __restrict__ clusterSizes) +{ // Loop index index_type_t i, j; @@ -179,8 +181,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, dist_min = dists[IDX(i, 0, n)]; for (j = 1; j < k; ++j) { dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } // Transfer result to global memory @@ -195,8 +197,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, } } -/** - * @brief Check if newly computed distances are smaller than old distances. +/** + * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -219,7 +221,8 @@ static __global__ void minDistances2(index_type_t n, value_type_t* __restrict__ dists_old, const value_type_t* __restrict__ dists_new, index_type_t* __restrict__ codes_old, - index_type_t code_new) { + index_type_t code_new) +{ // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -244,7 +247,7 @@ static __global__ void minDistances2(index_type_t n, } } -/** +/** * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. @@ -256,9 +259,11 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes( - index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { +static __global__ void computeClusterSizes(index_type_t n, + index_type_t k, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); @@ -266,8 +271,8 @@ static __global__ void computeClusterSizes( } } -/** - * @brief Divide rows of centroid matrix by cluster sizes. +/** + * @brief Divide rows of centroid matrix by cluster sizes. * Divides the ith column of the sum matrix by the size of the ith * cluster. If the sum matrix has been initialized so that the ith * row is the sum of all observation vectors in the ith cluster, @@ -288,9 +293,11 @@ static __global__ void computeClusterSizes( * column is the mean position of a cluster). */ template -static __global__ void divideCentroids( - index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) { +static __global__ void divideCentroids(index_type_t d, + index_type_t k, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) +{ // Global indices index_type_t gidx, gidy; @@ -341,15 +348,17 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template +template static int chooseNewCentroid(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, + index_type_t n, + index_type_t d, + index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) { + value_type_t* __restrict__ centroid) +{ // Cumulative sum of distances value_type_t* distsCumSum = dists + n; // Residual sum of squares @@ -358,43 +367,43 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t obsIndex; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::inclusive_scan(thrust_exec_policy, + thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync( + &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) // - //seg-faults due to Thrust bug - //on binary-search-like algorithms - //when run with stream dependent - //execution policies; fixed on Thrust GitHub - //hence replace w/ linear interpolation, - //until the Thrust issue gets resolved: + // seg-faults due to Thrust bug + // on binary-search-like algorithms + // when run with stream dependent + // execution policies; fixed on Thrust GitHub + // hence replace w/ linear interpolation, + // until the Thrust issue gets resolved: // // obsIndex = (thrust::lower_bound( // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - // thrust::device_pointer_cast(distsCumSum)); // - //linear interpolation logic: + // linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / - (distsSum - minSum)); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); } else { obsIndex = 0; } @@ -405,21 +414,23 @@ static int chooseNewCentroid(handle_t const& handle, obsIndex = min(obsIndex, n - 1); // Record new centroid position - CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, + CUDA_TRY(cudaMemcpyAsync(centroid, + obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); return 0; } /** - * @brief Choose initial cluster centroids for k-means algorithm. + * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy + * @param thrust_exec_policy thrust execution policy * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -439,14 +450,19 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. */ -template -static int initializeCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, - unsigned long long seed) { +template +static int initializeCentroids(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -459,7 +475,7 @@ static int initializeCentroids( thrust::uniform_real_distribution uniformDist(0, 1); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); constexpr index_type_t grid_lower_bound{65535}; @@ -471,36 +487,43 @@ static int initializeCentroids( dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; // CUDA grid dimensions - dim3 gridDim_warp{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), - 1, 1}; + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid - thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), 1); + thrust::fill(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids)) + if (chooseNewCentroid( + handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids, dists); + computeDistances<<>>(n, d, 1, obs, centroids, dists); CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, + thrust_exec_policy, + n, + d, + k, + uniformDist(rng), + obs, + dists, + centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -510,22 +533,20 @@ static int initializeCentroids( CHECK_CUDA(stream); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, - codes, i); + minDistances2<<>>(n, dists, dists + n, codes, i); CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, k, codes, - clusterSizes); + computeClusterSizes<<>>(n, k, codes, clusterSizes); CHECK_CUDA(stream); return 0; } -/** - * @brief Find cluster centroids closest to observation vectors. +/** + * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -553,16 +574,21 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. */ -template -static int assignCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { +template +static int assignCentroids(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) +{ auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -574,11 +600,9 @@ static int assignCentroids( constexpr index_type_t grid_lower_bound{65535}; gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); gridDim.y = min(k, grid_lower_bound); - gridDim.z = - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - computeDistances<<>>(n, d, k, obs, centroids, - dists); + computeDistances<<>>(n, d, k, obs, centroids, dists); CHECK_CUDA(stream); // Find centroid closest to each observation vector @@ -586,23 +610,21 @@ static int assignCentroids( blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, - clusterSizes); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); CHECK_CUDA(stream); // Compute residual sum of squares - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); return 0; } -/** - * @brief Update cluster centroids for k-means algorithm. +/** + * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -628,29 +650,31 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template +template static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, const index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) { + index_type_t* __restrict__ work_int) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const value_type_t one = 1; + const value_type_t one = 1; const value_type_t zero = 0; constexpr index_type_t grid_lower_bound{65535}; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory thrust::device_ptr obs_copy(work); @@ -658,34 +682,56 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, - d, &zero, (value_type_t*)NULL, n, - thrust::raw_pointer_cast(obs_copy), n, stream)); + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::modulus()); CHECK_CUDA(stream); - thrust::gather(thrust_exec_policy, rows, rows + d * n, - thrust::device_pointer_cast(codes), codes_copy); + thrust::gather( + thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::divides()); CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + thrust::stable_sort_by_key(thrust_exec_policy, + codes_copy, + codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + thrust::reduce_by_key(thrust_exec_policy, + rows, + rows + d * n, + obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); CHECK_CUDA(stream); @@ -696,12 +742,11 @@ static int updateCentroids(handle_t const& handle, dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; // CUDA grid dimensions - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), + 1}; - divideCentroids<<>>(d, k, clusterSizes, - centroids); + divideCentroids<<>>(d, k, clusterSizes, centroids); CHECK_CUDA(stream); return 0; @@ -715,8 +760,8 @@ namespace raft { // k-means algorithm // ========================================================= -/** - * @brief Find clusters with k-means algorithm. +/** + * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. @@ -754,17 +799,24 @@ namespace raft { * @param seed random seed to be used. * @return error flag. */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, - value_type_t* residual_host, index_type_t* iters_host, - unsigned long long seed) { + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int, + value_type_t* residual_host, + index_type_t* iters_host, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -786,100 +838,120 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // ------------------------------------------------------- auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), - cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + CUDA_TRY( + cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); + if (updateCentroids( + handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), - grid_lower_bound)}; + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, - work); + computeDistances<<>>(n, d, 1, obs, centroids, work); CHECK_CUDA(stream); - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), - thrust::device_pointer_cast(work + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); CHECK_CUDA(stream); return 0; } if (n <= k) { - thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::sequence(thrust_exec_policy, + thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), n, 1); + thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); CHECK_CUDA(stream); if (n < k) - CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, - (k - n) * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; } // Initialize cuBLAS - CUBLAS_CHECK( - linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - codes, clusterSizes, work, seed)) + if (initializeCentroids( + handle, thrust_exec_policy, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids( + handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, + thrust_exec_policy, + n, + d, + k, + obs, + centroids, + work, + codes, + clusterSizes, + residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); // FIXME: emptyCentroid never reaches k (infinite loop) under certain // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, - uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, + thrust_exec_policy, + n, + d, + k, + uniformDist(rng), + obs, + work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, + thrust_exec_policy, + n, + d, + k, + obs, + centroids, + work, + codes, + clusterSizes, + residual_host)) WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); CHECK_CUDA(stream); } @@ -891,14 +963,13 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, } // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) - WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); *iters_host = iter; return 0; } -/** +/** * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with @@ -926,13 +997,20 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, value_type_t& residual, - index_type_t& iters, unsigned long long seed = 123456) { +template +int kmeans(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ using namespace matrix; // Check that parameters are valid @@ -949,10 +1027,22 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans( - handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, - clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, - &iters, seed); + return kmeans(handle, + thrust_exec_policy, + n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters, + seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index d14bf05f37..35fc22c770 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -21,66 +21,125 @@ #include #include -//for now; TODO: check if/where this `define` should be; +// for now; TODO: check if/where this `define` should be; // #define USE_LAPACK namespace raft { -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status \ - << " had an illegal value."; \ - throw exception(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status << " had an illegal value."; \ + throw exception(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ } -extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, - float *work, int *lwork, int *info); -extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, - double *work, int *lwork, int *info); -extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, - float *a, int *lda, const float *tau, float *c, - int *ldc, float *work, int *lwork, int *info); -extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, - double *a, int *lda, const double *tau, double *c, - int *ldc, double *work, int *lwork, int *info); -extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, - double *wr, double *wi, double *vl, int *ldvl, double *vr, - int *ldvr, double *work, int *lwork, int *info); - -extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, - float *wr, float *wi, float *vl, int *ldvl, float *vr, - int *ldvr, float *work, int *lwork, int *info); - -extern "C" cusolverStatus_t cusolverDnSgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const float *alpha, const float *A, int lda, const float *B, int ldb, - const float *beta, float *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnDgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const double *alpha, const double *A, int lda, const double *B, int ldb, - const double *beta, double *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, - int n, float *d, float *e, - float *z, int ldz, float *work, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, - int n, double *d, double *e, - double *z, int ldz, - double *work, int *info); +extern "C" void sgeqrf_( + int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info); +extern "C" void dgeqrf_( + int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info); +extern "C" void sormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + float* a, + int* lda, + const float* tau, + float* c, + int* ldc, + float* work, + int* lwork, + int* info); +extern "C" void dormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + double* a, + int* lda, + const double* tau, + double* c, + int* ldc, + double* work, + int* lwork, + int* info); +extern "C" int dgeev_(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info); + +extern "C" int sgeev_(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost( + const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info); + +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info); template class Lapack { @@ -91,182 +150,339 @@ class Lapack { public: static void check_lapack_enabled(); - static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, + static void gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, int ldc); // special QR for lanczos - static void sterf(int n, T *d, T *e); - static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + static void sterf(int n, T* d, T* e); + static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work); // QR // computes the QR factorization of a general matrix - static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork); // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. // multiply C by implicit Q - static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork); - - static void geev(T *A, T *eigenvalues, int dim, int lda); - static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + static void ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork); + + static void geev(T* A, T* eigenvalues, int dim, int lda); + static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); + static void geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, int ldvr); - static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, - T *eigenvectors_i, int dim, int lda, int ldvr); private: - static void lapack_gemm(const char transa, const char transb, int m, int n, - int k, float alpha, const float *a, int lda, - const float *b, int ldb, float beta, float *c, - int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (float *)a, lda, (float *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const char transa, + const char transb, + int m, + int n, + int k, + float alpha, + const float* a, + int lda, + const float* b, + int ldb, + float beta, + float* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost( + cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc); } - static void lapack_gemm(const signed char transa, const signed char transb, - int m, int n, int k, double alpha, const double *a, - int lda, const double *b, int ldb, double beta, - double *c, int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (double *)a, lda, (double *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const signed char transa, + const signed char transb, + int m, + int n, + int k, + double alpha, + const double* a, + int lda, + const double* b, + int ldb, + double beta, + double* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, + cublas_transb, + m, + n, + k, + &alpha, + (double*)a, + lda, + (double*)b, + ldb, + &beta, + c, + ldc); } - static void lapack_sterf(int n, float *d, float *e, int *info) { + static void lapack_sterf(int n, float* d, float* e, int* info) + { cusolverDnSsterfHost(n, d, e, info); } - static void lapack_sterf(int n, double *d, double *e, int *info) { + static void lapack_sterf(int n, double* d, double* e, int* info) + { cusolverDnDsterfHost(n, d, e, info); } - static void lapack_steqr(const signed char compz, int n, float *d, float *e, - float *z, int ldz, float *work, int *info) { + static void lapack_steqr( + const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info) + { cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_steqr(const signed char compz, int n, double *d, double *e, - double *z, int ldz, double *work, int *info) { + static void lapack_steqr(const signed char compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info) + { cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, - float *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info) + { sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, - double *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info) + { dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, - int lda, float *tau, float *c, int ldc, float *work, - int *lwork, int *info) { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + float* a, + int lda, + float* tau, + float* c, + int ldc, + float* work, + int* lwork, + int* info) + { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, - double *a, int lda, double *tau, double *c, int ldc, - double *work, int *lwork, int *info) { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + double* a, + int lda, + double* tau, + double* c, + int ldc, + double* work, + int* lwork, + int* info) + { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, - int *lda, double *wr, double *wi, double *vl, - int *ldvl, double *vr, int *ldvr, - double *work, int *lwork, int *info) { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info) + { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, - int *lda, float *wr, float *wi, float *vl, - int *ldvl, float *vr, int *ldvr, float *work, - int *lwork, int *info) { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info) + { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } // real eigenvalues - static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) { + static void lapack_geev(T* A, T* eigenvalues, int dim, int lda) + { char job = 'N'; std::vector WI(dim); - int ldv = 1; - T *vl = 0; + int ldv = 1; + T* vl = 0; int work_size = 6 * dim; std::vector work(work_size); int info; - lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, - &ldv, vl, &ldv, work.data(), &work_size, &info); + lapack_geev_dispatch(&job, + &job, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldv, + vl, + &ldv, + work.data(), + &work_size, + &info); lapackCheckError(info); } // real eigenpairs - static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, - int lda, int ldvr) { + static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) + { char jobvl = 'N'; char jobvr = 'V'; std::vector WI(dim); int work_size = 6 * dim; - T *vl = 0; - int ldvl = 1; + T* vl = 0; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), - vl, &ldvl, eigenvectors, &ldvr, work.data(), - &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldvl, + eigenvectors, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } // complex eigenpairs - static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, - int lda, int ldvr) { - char jobvl = 'N'; - char jobvr = 'V'; + static void lapack_geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; int work_size = 8 * dim; - int ldvl = 1; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, - eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, - work.data(), &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues_r, + eigenvalues_i, + 0, + &ldvl, + eigenvectors_r, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } }; template -void Lapack::check_lapack_enabled() { +void Lapack::check_lapack_enabled() +{ #ifndef USE_LAPACK RAFT_FAIL("Error: LAPACK not enabled."); #endif } template -void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, - int ldc) { +void Lapack::gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, + int ldc) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK const char transA_char = transa ? 'T' : 'N'; const char transB_char = transb ? 'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, - ldc); + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); //#endif } template -void Lapack::sterf(int n, T *d, T *e) { +void Lapack::sterf(int n, T* d, T* e) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -276,7 +492,8 @@ void Lapack::sterf(int n, T *d, T *e) { } template -void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { +void Lapack::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -286,8 +503,8 @@ void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { } template -void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, - int *lwork) { +void Lapack::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK int info; @@ -296,11 +513,22 @@ void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, #endif } template -void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork) { +void Lapack::ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; + char side = right_side ? 'R' : 'L'; char trans = transq ? 'T' : 'N'; int info; lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); @@ -310,7 +538,8 @@ void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, // real eigenvalues template -void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { +void Lapack::geev(T* A, T* eigenvalues, int dim, int lda) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, dim, lda); @@ -318,8 +547,8 @@ void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { } // real eigenpairs template -void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); @@ -327,13 +556,18 @@ void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, } // complex eigenpairs template -void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, - dim, lda, ldvr); + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); #endif } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index c43154d17a..89d2b7e8ec 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -40,10 +40,12 @@ using size_type = int; // for now; TODO: move it in appropriate header // Apply diagonal matrix to vector: // template -static __global__ void diagmv(IndexType_ n, ValueType_ alpha, +static __global__ void diagmv(IndexType_ n, + ValueType_ alpha, const ValueType_* __restrict__ D, const ValueType_* __restrict__ x, - ValueType_* __restrict__ y) { + ValueType_* __restrict__ y) +{ IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { y[i] += alpha * D[i] * x[i]; @@ -58,7 +60,7 @@ enum struct sparse_mv_alg_t : int { SPARSE_MV_UNDEFINED = -1, SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix SPARSE_MV_ALG1, // typical for CSR - SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices + SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices }; // Vector "view"-like aggregate for linear algebra purposes @@ -68,21 +70,21 @@ struct vector_view_t { value_type* buffer_; size_type size_; - vector_view_t(value_type* buffer, size_type sz) - : buffer_(buffer), size_(sz) {} + vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {} - vector_view_t(vector_view_t&& other) - : buffer_(other.buffer_), size_(other.size_) { + vector_view_t(vector_view_t&& other) : buffer_(other.buffer_), size_(other.size_) + { other.buffer_ = nullptr; - other.size_ = 0; + other.size_ = 0; } - vector_view_t& operator=(vector_view_t&& other) { + vector_view_t& operator=(vector_view_t&& other) + { buffer_ = other.buffer_; - size_ = other.size_; + size_ = other.size_; other.buffer_ = nullptr; - other.size_ = 0; + other.size_ = 0; } }; @@ -98,15 +100,16 @@ class vector_t { public: vector_t(handle_t const& raft_handle, size_type sz) : handle_(raft_handle), - buffer_( - static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), + buffer_(static_cast(raft_handle.get_device_allocator()->allocate( + sz * sizeof(value_type), raft_handle.get_stream()))), size_(sz), - stream_(raft_handle.get_stream()) {} + stream_(raft_handle.get_stream()) + { + } - ~vector_t(void) { - handle_.get_device_allocator()->deallocate( - buffer_, size_ * sizeof(value_type), stream_); + ~vector_t(void) + { + handle_.get_device_allocator()->deallocate(buffer_, size_ * sizeof(value_type), stream_); } size_type size(void) const { return size_; } @@ -116,26 +119,31 @@ class vector_t { value_type const* raw(void) const { return buffer_; } template - value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, - [] __device__(auto left, auto right) { - auto abs_left = left > 0 ? left : -left; - auto abs_right = right > 0 ? right : -right; - return abs_left + abs_right; - }); + value_type nrm1(ThrustExecPolicy t_exe_pol) const + { + return thrust::reduce( + t_exe_pol, buffer_, buffer_ + size_, value_type{0}, [] __device__(auto left, auto right) { + auto abs_left = left > 0 ? left : -left; + auto abs_right = right > 0 ? right : -right; + return abs_left + abs_right; + }); } template - void fill(ThrustExecPolicy t_exe_pol, value_type value) { + void fill(ThrustExecPolicy t_exe_pol, value_type value) + { thrust::fill_n(t_exe_pol, buffer_, size_, value); } }; template struct sparse_matrix_t { - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const ncols, + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const ncols, index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), @@ -143,18 +151,25 @@ struct sparse_matrix_t { values_(values), nrows_(nrows), ncols_(ncols), - nnz_(nnz) {} + nnz_(nnz) + { + } - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), col_indices_(col_indices), values_(values), nrows_(nrows), ncols_(nrows), - nnz_(nnz) {} + nnz_(nnz) + { + } template sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) @@ -164,7 +179,9 @@ struct sparse_matrix_t { values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), ncols_(csr_view.number_of_vertices), - nnz_(csr_view.number_of_edges) {} + nnz_(csr_view.number_of_edges) + { + } virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types @@ -174,21 +191,24 @@ struct sparse_matrix_t { // descriptor creation works with non-const, and const-casting // down is dangerous) // - virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + virtual void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const { + bool transpose = false, + bool symmetric = false) const + { using namespace sparse; RAFT_EXPECTS(x != nullptr, "Null x buffer."); RAFT_EXPECTS(y != nullptr, "Null y buffer."); auto cusparse_h = handle_.get_cusparse_handle(); - auto stream = handle_.get_stream(); + auto stream = handle_.get_stream(); - cusparseOperation_t trans = - transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose - CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; // non-transpose #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP auto size_x = transpose ? nrows_ : ncols_; @@ -196,15 +216,19 @@ struct sparse_matrix_t { cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg); - //create descriptors: + // create descriptors: //(below casts are necessary, because // cusparseCreateCsr(...) takes non-const // void*; the casts should be harmless) // cusparseSpMatDescr_t matA; - CUSPARSE_CHECK(cusparsecreatecsr( - &matA, nrows_, ncols_, nnz_, const_cast(row_offsets_), - const_cast(col_indices_), const_cast(values_))); + CUSPARSE_CHECK(cusparsecreatecsr(&matA, + nrows_, + ncols_, + nnz_, + const_cast(row_offsets_), + const_cast(col_indices_), + const_cast(values_))); cusparseDnVecDescr_t vecX; CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x)); @@ -212,31 +236,29 @@ struct sparse_matrix_t { cusparseDnVecDescr_t vecY; CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y)); - //get (scratch) external device buffer size: + // get (scratch) external device buffer size: // size_t bufferSize; - CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, - vecX, &beta, vecY, spmv_alg, - &bufferSize, stream)); + CUSPARSE_CHECK(cusparsespmv_buffersize( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream)); - //allocate external buffer: + // allocate external buffer: // vector_t external_buffer(handle_, bufferSize); - //finally perform SpMV: + // finally perform SpMV: // - CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, - vecY, spmv_alg, external_buffer.raw(), stream)); + CUSPARSE_CHECK(cusparsespmv( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream)); - //free descriptors: + // free descriptors: //(TODO: maybe wrap them in a RAII struct?) // CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); #else - CUSPARSE_CHECK( - cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); if (symmetric) { @@ -245,9 +267,20 @@ struct sparse_matrix_t { CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_, - &alpha, descr, values_, row_offsets_, - col_indices_, x, &beta, y, stream)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, + trans, + nrows_, + ncols_, + nnz_, + &alpha, + descr, + values_, + row_offsets_, + col_indices_, + x, + &beta, + y, + stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); #endif } @@ -255,19 +288,18 @@ struct sparse_matrix_t { handle_t const& get_handle(void) const { return handle_; } #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP - cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const { + cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const + { switch (alg) { - case sparse_mv_alg_t::SPARSE_MV_ALG1: - return CUSPARSE_CSRMV_ALG1; - case sparse_mv_alg_t::SPARSE_MV_ALG2: - return CUSPARSE_CSRMV_ALG2; - default: - return CUSPARSE_MV_ALG_DEFAULT; + case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1; + case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2; + default: return CUSPARSE_MV_ALG_DEFAULT; } } #endif - //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, + // aggregate handle_t const& handle_; index_type const* row_offsets_; @@ -284,44 +316,51 @@ struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) - : sparse_matrix_t(raft_handle, row_offsets, - col_indices, values, nrows, nnz), - diagonal_(raft_handle, nrows) { + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) + : sparse_matrix_t( + raft_handle, row_offsets, col_indices, values, nrows, nnz), + diagonal_(raft_handle, nrows) + { vector_t ones{raft_handle, nrows}; ones.fill(thrust_exec_policy, 1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } template laplacian_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : sparse_matrix_t(raft_handle, csr_m.row_offsets_, - csr_m.col_indices_, csr_m.values_, - csr_m.nrows_, csr_m.nnz_), - diagonal_(raft_handle, csr_m.nrows_) { + : sparse_matrix_t(raft_handle, + csr_m.row_offsets_, + csr_m.col_indices_, + csr_m.values_, + csr_m.nrows_, + csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) + { vector_t ones{raft_handle, csr_m.nrows_}; ones.fill(thrust_exec_policy, 1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { constexpr int BLOCK_SIZE = 1024; - auto n = sparse_matrix_t::nrows_; + auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // scales y by beta: // @@ -333,8 +372,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply diagonal matrix // - dim3 gridDim{ - std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + dim3 gridDim{std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; dim3 blockDim{BLOCK_SIZE, 1, 1}; diagmv<<>>(n, alpha, diagonal_.raw(), x, y); @@ -342,8 +380,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply adjacency matrix // - sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, symmetric); } vector_t diagonal_; @@ -355,58 +392,68 @@ struct modularity_matrix_t : laplacian_matrix_t { modularity_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + raft_handle, thrust_exec_policy, row_offsets, col_indices, values, nrows, nnz) + { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(thrust_exec_policy); } template modularity_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_m) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + : laplacian_matrix_t(raft_handle, thrust_exec_policy, csr_m) + { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(thrust_exec_policy); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // y = A*x // - sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, symmetric); value_type dot_res; // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - CUBLAS_CHECK(linalg::cublasdot( - cublas_h, n, laplacian_matrix_t::diagonal_.raw(), - 1, x, 1, &dot_res, stream)); + CUBLAS_CHECK(linalg::cublasdot(cublas_h, + n, + laplacian_matrix_t::diagonal_.raw(), + 1, + x, + 1, + &dot_res, + stream)); // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - CUBLAS_CHECK(linalg::cublasaxpy( - cublas_h, n, &gamma_, - laplacian_matrix_t::diagonal_.raw(), 1, y, 1, - stream)); + CUBLAS_CHECK(linalg::cublasaxpy(cublas_h, + n, + &gamma_, + laplacian_matrix_t::diagonal_.raw(), + 1, + y, + 1, + stream)); } value_type edge_sum_; diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index f8dfe5daa3..bb7087a3be 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -40,7 +40,8 @@ #endif #ifdef COLLECT_TIME_STATISTICS -static double timer(void) { +static double timer(void) +{ struct timeval tv; cudaDeviceSynchronize(); gettimeofday(&tv, NULL); @@ -79,19 +80,27 @@ using namespace linalg; * performed. * @return error flag. */ -template +template std::tuple modularity_maximization( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + handle_t const& handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -104,11 +113,10 @@ std::tuple modularity_maximization( modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = - eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); @@ -119,8 +127,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -139,12 +147,13 @@ std::tuple modularity_maximization( * @param modularity On exit, modularity */ template -void analyzeModularity(handle_t const &handle, +void analyzeModularity(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + sparse_matrix_t const& csr_m, vertex_t nClusters, - vertex_t const *__restrict__ clusters, - weight_t &modularity) { + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; @@ -152,15 +161,14 @@ void analyzeModularity(handle_t const &handle, weight_t partModularity, clustersize; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory vector_t part_i(handle, n); vector_t Bx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; @@ -170,8 +178,8 @@ void analyzeModularity(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator( + handle, thrust_exec_policy, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 841fca04d9..e2576c1d69 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -62,22 +62,30 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . */ -template -std::tuple partition( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { +template +std::tuple partition(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + //cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_; @@ -88,22 +96,21 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute smallest eigenvalues and eigenvectors - std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -130,18 +137,21 @@ std::tuple partition( * @return error flag. */ template -void analyzePartition(handle_t const &handle, +void analyzePartition(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, - vertex_t nClusters, const vertex_t *__restrict__ clusters, - weight_t &edgeCut, weight_t &cost) { + sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; vertex_t n = csr_m.nrows_; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); weight_t partEdgesCut, clustersize; @@ -150,22 +160,21 @@ void analyzePartition(handle_t const &handle, vector_t Lx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; // Initialize output - cost = 0; + cost = 0; edgeCut = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator( + handle, thrust_exec_policy, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 40dde30a74..5349cb2810 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -28,20 +28,18 @@ namespace raft { namespace spectral { template -static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, - value_type_t* obs) { +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t i, j, k, index, mm; value_type_t alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x alpha = 0.0; - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { // check if the thread is valid valid = i < m; @@ -66,17 +64,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, // scale by alpha alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; + index = i + j * m; obs[index] = obs[index] / alpha; } } } template -index_type_t next_pow2(index_type_t n) { +index_type_t next_pow2(index_type_t n) +{ index_type_t v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float @@ -90,7 +88,8 @@ index_type_t next_pow2(index_type_t n) { } template -cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t p2m; // find next power of 2 @@ -102,19 +101,20 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel - <<>>(m, n, obs); + scale_obs_kernel<<>>(m, n, obs); return cudaSuccess; } -template +template void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t n, - vertex_t nEigVecs, weight_t* eigVecs) { + ThrustExePolicy thrust_exec_policy, + edge_t n, + vertex_t nEigVecs, + weight_t* eigVecs) +{ auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -123,9 +123,9 @@ void transform_eigen_matrix(handle_t const& handle, for (auto i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, @@ -136,8 +136,7 @@ void transform_eigen_matrix(handle_t const& handle, thrust::minus()); CHECK_CUDA(stream); - CUBLAS_CHECK( - cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -154,16 +153,25 @@ void transform_eigen_matrix(handle_t const& handle, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, - &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, - work.raw(), nEigVecs, stream)); - - CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), - nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); } } @@ -178,49 +186,48 @@ struct equal_to_i_op { public: equal_to_i_op(index_type_t _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; } }; } // namespace // Construct indicator vector for ith partition // -template +template bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t index, - edge_t n, weight_t& clustersize, weight_t& partStats, + ThrustExePolicy thrust_exec_policy, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, vertex_t const* __restrict__ clusters, - vector_t& part_i, vector_t& Bx, - laplacian_matrix_t const& B) { + vector_t& part_i, + vector_t& Bx, + laplacian_matrix_t const& B) +{ auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - thrust::for_each(thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(index)); + auto stream = handle.get_stream(); + + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); CHECK_CUDA(stream); // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &clustersize, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); - if (clustersize < 0.5) { - return false; - } + if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK( - cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp index 406f1b7c7e..08a4e6efb5 100644 --- a/cpp/include/raft/spectral/warn_dbg.hpp +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -4,13 +4,13 @@ #include #define STRINGIFY_DETAIL(x) #x -#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) #ifdef DEBUG #define COUT() (std::cout) #define CERR() (std::cerr) -//nope: +// nope: // #define WARNING(message) \ do { \ diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh index 8691cabc85..4d6724482c 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/mean.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -46,8 +46,8 @@ __global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -57,9 +57,7 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } } /** @@ -80,24 +78,22 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, * @param stream: cuda stream */ template -void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, - bool rowMajor, cudaStream_t stream) { +void mean( + Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor - <<>>(mu, data, D, N); + meanKernelRowMajor<<>>(mu, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); } else { - meanKernelColMajor - <<>>(mu, data, D, N); + meanKernelColMajor<<>>(mu, data, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh index 04934d4388..c0ba24312b 100644 --- a/cpp/include/raft/stats/mean_center.cuh +++ b/cpp/include/raft/stats/mean_center.cuh @@ -38,12 +38,25 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void meanCenter(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } /** @@ -61,11 +74,25 @@ void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, * @param stream cuda stream where to launch work */ template -void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, cudaStream_t stream) { +void meanAdd(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } }; // end namespace stats diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh index f12c633829..1dd9cd56bc 100644 --- a/cpp/include/raft/stats/stddev.cuh +++ b/cpp/include/raft/stats/stddev.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColPerBlk has been tested only for 32! template -__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, - IdxType N) { +__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) { Type val = (colId < D) ? data[i * D + colId] : Type(0); thread_data += val * val; @@ -48,41 +48,39 @@ __global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, } template -__global__ void stddevKernelColMajor(Type *std, const Type *data, - const Type *mu, IdxType D, IdxType N) { +__global__ void stddevKernelColMajor( + Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - std[blockIdx.x] = raft::mySqrt(acc / N); - } + if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); } } template -__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, - IdxType D, IdxType N) { +__global__ void varsKernelColMajor( + Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - var[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } } /** @@ -104,28 +102,33 @@ __global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, * @param stream cuda stream where to launch work */ template -void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void stddev(Type* std, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(std, data, D, N); + stddevKernelRowMajor<<>>(std, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - std, std, mu, D, - [ratio] __device__(Type a, Type b) { - return raft::mySqrt(a * ratio - b * b); - }, + std, + std, + mu, + D, + [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); }, stream); } else { - stddevKernelColMajor - <<>>(std, data, mu, D, N); + stddevKernelColMajor<<>>(std, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } @@ -149,25 +152,28 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, * @param stream cuda stream where to launch work */ template -void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void vars(Type* var, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(var, data, D, N); + stddevKernelRowMajor<<>>(var, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - var, var, mu, D, - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); + var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); } else { - varsKernelColMajor - <<>>(var, data, mu, D, N); + varsKernelColMajor<<>>(var, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh index 5f8416c7e2..c7b8ce12b6 100644 --- a/cpp/include/raft/stats/sum.cuh +++ b/cpp/include/raft/stats/sum.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -46,8 +46,8 @@ __global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -57,9 +57,7 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } } /** @@ -77,21 +75,19 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, * @param stream cuda stream where to launch work */ template -void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, - cudaStream_t stream) { +void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D)); sumKernelRowMajor <<>>(output, input, D, N); } else { - sumKernelColMajor - <<>>(output, input, D, N); + sumKernelColMajor<<>>(output, input, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index 1829fc0351..1e0885fb99 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -22,11 +22,11 @@ namespace raft { template -struct IOType {}; +struct IOType { +}; template <> struct IOType { - static_assert(sizeof(bool) == sizeof(int8_t), - "IOType bool size assumption failed"); + static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed"); typedef int8_t Type; }; template <> @@ -215,42 +215,42 @@ struct IOType { }; /** - * @struct TxN_t - * - * @brief Internal data structure that is used to define a facade for vectorized - * loads/stores across the most common POD types. The goal of his file is to - * provide with CUDA programmers, an easy way to have compiler issue vectorized - * load or store instructions to memory (either global or shared). Vectorized - * accesses to memory are important as they'll utilize its resources - * efficiently, - * when compared to their non-vectorized counterparts. Obviously, for whatever - * reasons if one is unable to issue such vectorized operations, one can always - * fallback to using POD types. - * - * Example demonstrating the use of load operations, performing math on such - * loaded data and finally storing it back. - * @code{.cu} - * TxN_t mydata1, mydata2; - * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; - * mydata1.load(ptr1, idx); - * mydata2.load(ptr2, idx); - * #pragma unroll - * for(int i=0;i type. - * Only change required is to replace variable declaration appropriately. - * - * Obviously, it's caller's responsibility to take care of pointer alignment! - * - * @tparam math_ the data-type in which the compute/math needs to happen - * @tparam veclen_ the number of 'math_' types to be loaded/stored per - * instruction - */ + * @struct TxN_t + * + * @brief Internal data structure that is used to define a facade for vectorized + * loads/stores across the most common POD types. The goal of his file is to + * provide with CUDA programmers, an easy way to have compiler issue vectorized + * load or store instructions to memory (either global or shared). Vectorized + * accesses to memory are important as they'll utilize its resources + * efficiently, + * when compared to their non-vectorized counterparts. Obviously, for whatever + * reasons if one is unable to issue such vectorized operations, one can always + * fallback to using POD types. + * + * Example demonstrating the use of load operations, performing math on such + * loaded data and finally storing it back. + * @code{.cu} + * TxN_t mydata1, mydata2; + * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; + * mydata1.load(ptr1, idx); + * mydata2.load(ptr2, idx); + * #pragma unroll + * for(int i=0;i type. + * Only change required is to replace variable declaration appropriately. + * + * Obviously, it's caller's responsibility to take care of pointer alignment! + * + * @tparam math_ the data-type in which the compute/math needs to happen + * @tparam veclen_ the number of 'math_' types to be loaded/stored per + * instruction + */ template struct TxN_t { /** underlying math data type */ @@ -274,7 +274,8 @@ struct TxN_t { * @brief Fill the contents of this structure with a constant value * @param _val the constant to be filled */ - DI void fill(math_t _val) { + DI void fill(math_t _val) + { #pragma unroll for (int i = 0; i < Ratio; ++i) { val.data[i] = _val; @@ -299,21 +300,24 @@ struct TxN_t { * @{ */ template - DI void load(const math_t *ptr, idx_t idx) { - const io_t *bptr = reinterpret_cast(&ptr[idx]); - val.internal = __ldg(bptr); + DI void load(const math_t* ptr, idx_t idx) + { + const io_t* bptr = reinterpret_cast(&ptr[idx]); + val.internal = __ldg(bptr); } template - DI void load(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); + DI void load(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); val.internal = *bptr; } template - DI void store(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); - *bptr = val.internal; + DI void store(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); + *bptr = val.internal; } /** @} */ }; @@ -330,11 +334,17 @@ struct TxN_t { DI void fill(math_t _val) {} template - DI void load(const math_t *ptr, idx_t idx) {} + DI void load(const math_t* ptr, idx_t idx) + { + } template - DI void load(math_t *ptr, idx_t idx) {} + DI void load(math_t* ptr, idx_t idx) + { + } template - DI void store(math_t *ptr, idx_t idx) {} + DI void store(math_t* ptr, idx_t idx) + { + } }; } // namespace raft diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 4ff6cdf5fa..284a873dec 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -23,7 +23,8 @@ namespace raft { -TEST(Raft, ClusterSolvers) { +TEST(Raft, ClusterSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -40,7 +41,7 @@ TEST(Raft, ClusterSolvers) { index_type d{10}; index_type k{5}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvecs{nullptr}; index_type* codes{nullptr}; @@ -49,11 +50,11 @@ TEST(Raft, ClusterSolvers) { kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, - eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes)); } -TEST(Raft, ModularitySolvers) { +TEST(Raft, ModularitySolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -68,7 +69,7 @@ TEST(Raft, ModularitySolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -82,21 +83,18 @@ TEST(Raft, ModularitySolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW( - spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW(spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index c14d880efd..150767992f 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, Utils) { +TEST(Raft, Utils) +{ ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); ASSERT_THROW(THROW("Should throw!"), exception); diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index e2ed2c01dc..9ed32b80ef 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -25,30 +25,42 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x, - const DataType *y, int m, int n, int k, - DataType eps, bool isRowMajor) { +__global__ void naiveDistanceAdjKernel(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc <= eps; } template -void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m, - int n, int k, DataType eps, bool isRowMajor) { +void naiveDistanceAdj(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); - naiveDistanceAdjKernel - <<>>(dist, x, y, m, n, k, eps, isRowMajor); + naiveDistanceAdjKernel<<>>(dist, x, y, m, n, k, eps, isRowMajor); CUDA_CHECK(cudaPeekAtLastError()); } @@ -61,21 +73,21 @@ struct DistanceAdjInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceAdjInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs& dims) +{ return os; } template -class DistanceAdjTest - : public ::testing::TestWithParam> { +class DistanceAdjTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -89,25 +101,23 @@ class DistanceAdjTest DataType threshold = params.eps; naiveDistanceAdj(dist_ref, x, y, m, n, k, threshold, isRowMajor); - char *workspace = nullptr; - size_t worksize = - raft::distance::getWorkspaceSize(x, y, m, n, k); - if (worksize != 0) { - raft::allocate(workspace, worksize); - } + char* workspace = nullptr; + size_t worksize = raft::distance:: + getWorkspaceSize( + x, y, m, n, k); + if (worksize != 0) { raft::allocate(workspace, worksize); } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { return d_val <= threshold; }; - raft::distance::distance( + raft::distance::distance( x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(workspace)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(x)); CUDA_CHECK(cudaFree(y)); CUDA_CHECK(cudaFree(dist_ref)); @@ -131,13 +141,13 @@ const std::vector> inputsf = { {10.0f, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestF; -TEST_P(DistanceAdjTestF, Result) { +TEST_P(DistanceAdjTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.01, 1024, 1024, 32, true, 1234ULL}, @@ -150,13 +160,13 @@ const std::vector> inputsd = { {10.0, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestD; -TEST_P(DistanceAdjTestD, Result) { +TEST_P(DistanceAdjTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd)); } // namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu index 10bc4d1899..c812a1985d 100644 --- a/cpp/test/distance/dist_canberra.cu +++ b/cpp/test/distance/dist_canberra.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceCanberra - : public DistanceTest {}; +class DistanceCanberra : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraF; -TEST_P(DistanceCanberraF, Result) { +TEST_P(DistanceCanberraF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraD; -TEST_P(DistanceCanberraD, Result) { +TEST_P(DistanceCanberraD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu index 6a2b02863a..0a4a69f059 100644 --- a/cpp/test/distance/dist_chebyshev.cu +++ b/cpp/test/distance/dist_chebyshev.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceLinf - : public DistanceTest {}; +class DistanceLinf : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfF; -TEST_P(DistanceLinfF, Result) { +TEST_P(DistanceLinfF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfD; -TEST_P(DistanceLinfD, Result) { +TEST_P(DistanceLinfD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu index 291c4196f9..f7510c17b1 100644 --- a/cpp/test/distance/dist_cos.cu +++ b/cpp/test/distance/dist_cos.cu @@ -21,9 +21,8 @@ namespace raft { namespace distance { template -class DistanceExpCos - : public DistanceTest {}; +class DistanceExpCos : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +35,13 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosF; -TEST_P(DistanceExpCosF, Result) { +TEST_P(DistanceExpCosF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +54,13 @@ const std::vector> inputsd = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosD; -TEST_P(DistanceExpCosD, Result) { +TEST_P(DistanceExpCosD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu index 46e7ded0ec..e90d0e83dc 100644 --- a/cpp/test/distance/dist_euc_exp.cu +++ b/cpp/test/distance/dist_euc_exp.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceEucExpTest - : public DistanceTest {}; +class DistanceEucExpTest : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,13 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestF; -TEST_P(DistanceEucExpTestF, Result) { +TEST_P(DistanceEucExpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +54,13 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestD; -TEST_P(DistanceEucExpTestD, Result) { +TEST_P(DistanceEucExpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu index 92f424647d..90412a9cb2 100644 --- a/cpp/test/distance/dist_euc_unexp.cu +++ b/cpp/test/distance/dist_euc_unexp.cu @@ -36,14 +36,13 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestF; -TEST_P(DistanceEucUnexpTestF, Result) { +TEST_P(DistanceEucUnexpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +55,13 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestD; -TEST_P(DistanceEucUnexpTestD, Result) { +TEST_P(DistanceEucUnexpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu index 39dc7aaeff..95b1908dc1 100644 --- a/cpp/test/distance/dist_hellinger.cu +++ b/cpp/test/distance/dist_hellinger.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceHellingerExp - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpF; -TEST_P(DistanceHellingerExpF, Result) { +TEST_P(DistanceHellingerExpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpD; -TEST_P(DistanceHellingerExpD, Result) { +TEST_P(DistanceHellingerExpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu index bd32837e45..d14f8d8a0b 100644 --- a/cpp/test/distance/dist_l1.cu +++ b/cpp/test/distance/dist_l1.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceUnexpL1 - : public DistanceTest {}; +class DistanceUnexpL1 : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1F; -TEST_P(DistanceUnexpL1F, Result) { +TEST_P(DistanceUnexpL1F, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1D; -TEST_P(DistanceUnexpL1D, Result) { +TEST_P(DistanceUnexpL1D, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu index 42b8e294ac..cc6a5f60de 100644 --- a/cpp/test/distance/dist_minkowski.cu +++ b/cpp/test/distance/dist_minkowski.cu @@ -21,8 +21,7 @@ namespace raft { namespace distance { template -class DistanceLpUnexp - : public DistanceTest { +class DistanceLpUnexp : public DistanceTest { }; const std::vector> inputsf = { @@ -36,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f}, }; typedef DistanceLpUnexp DistanceLpUnexpF; -TEST_P(DistanceLpUnexpF, Result) { +TEST_P(DistanceLpUnexpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL, 4.0}, @@ -56,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0}, }; typedef DistanceLpUnexp DistanceLpUnexpD; -TEST_P(DistanceLpUnexpD, Result) { +TEST_P(DistanceLpUnexpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index fc7b064205..a99d307abb 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -25,43 +25,52 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, int k, +__global__ void naiveDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, raft::distance::DistanceType type, - bool isRowMajor) { + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } if (type == raft::distance::DistanceType::L2SqrtExpanded || type == raft::distance::DistanceType::L2SqrtUnexpanded) acc = raft::mySqrt(acc); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveL1_Linf_CanberraDistanceKernel( - DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, - raft::distance::DistanceType type, bool isRowMajor) { +__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = (a > b) ? (a - b) : (b - a); if (type == raft::distance::DistanceType::Linf) { acc = raft::myMax(acc, diff); @@ -75,29 +84,27 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel( } } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveCosineDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } - DataType acc_a = DataType(0); - DataType acc_b = DataType(0); + DataType acc_a = DataType(0); + DataType acc_b = DataType(0); DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_a += a * a; acc_b += b * b; acc_ab += a * b; @@ -106,64 +113,74 @@ __global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Use 1.0 - (cosine similarity) to calc the distance - dist[outidx] = - (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); + dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); } template -__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveHellingerDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_ab += raft::mySqrt(a) * raft::mySqrt(b); } int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - acc_ab = 1 - acc_ab; + acc_ab = 1 - acc_ab; auto rectifier = (!signbit(acc_ab)); - dist[outidx] = raft::mySqrt(rectifier * acc_ab); + dist[outidx] = raft::mySqrt(rectifier * acc_ab); } template -__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor, DataType p) { +__global__ void naiveLpUnexpDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + bool isRowMajor, + DataType p) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = raft::L1Op()(a - b); acc += raft::myPow(diff, p); } auto one_over_p = 1 / p; - acc = raft::myPow(acc, one_over_p); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; - dist[outidx] = acc; + acc = raft::myPow(acc, one_over_p); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; } template -void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, - int n, int k, raft::distance::DistanceType type, - bool isRowMajor, DataType metric_arg = 2.0f) { +void naiveDistance(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); @@ -178,23 +195,19 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, case raft::distance::DistanceType::L2Unexpanded: case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: - naiveDistanceKernel - <<>>(dist, x, y, m, n, k, type, isRowMajor); + naiveDistanceKernel<<>>(dist, x, y, m, n, k, type, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - naiveCosineDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveCosineDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - naiveHellingerDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveHellingerDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: naiveLpUnexpDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); break; - default: - FAIL() << "should be here\n"; + default: FAIL() << "should be here\n"; } CUDA_CHECK(cudaPeekAtLastError()); } @@ -209,37 +222,47 @@ struct DistanceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceInputs& dims) +{ return os; } template -void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2, - int m, int n, int k, DistanceInputs ¶ms, - DataType threshold, char *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor, - DataType metric_arg = 2.0f) { +void distanceLauncher(DataType* x, + DataType* y, + DataType* dist, + DataType* dist2, + int m, + int n, + int k, + DistanceInputs& params, + DataType threshold, + char* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) { dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val; return d_val; }; raft::distance::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); } template class DistanceTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; DataType metric_arg = params.metric_arg; - bool isRowMajor = params.isRowMajor; + bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(x, m * k); @@ -256,25 +279,33 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, - metric_arg); - char *workspace = nullptr; + naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg); + char* workspace = nullptr; size_t worksize = - raft::distance::getWorkspaceSize(x, y, m, n, k); - if (worksize != 0) { - raft::allocate(workspace, worksize); - } + raft::distance::getWorkspaceSize(x, y, m, n, k); + if (worksize != 0) { raft::allocate(workspace, worksize); } DataType threshold = -10000.f; - distanceLauncher(x, y, dist, dist2, m, n, k, params, - threshold, workspace, worksize, - stream, isRowMajor, metric_arg); + distanceLauncher(x, + y, + dist, + dist2, + m, + n, + k, + params, + threshold, + workspace, + worksize, + stream, + isRowMajor, + metric_arg); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(workspace)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(x)); CUDA_CHECK(cudaFree(y)); CUDA_CHECK(cudaFree(dist_ref)); diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 4573a070b6..a7b763a2bc 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -29,40 +29,40 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template -__global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, - DataT *y, int m, int n, int k, int *workspace, - DataT maxVal) { - int midx = threadIdx.y + blockIdx.y * blockDim.y; - int nidx = threadIdx.x + blockIdx.x * blockDim.x; +__global__ void naiveKernel(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + DataT maxVal) +{ + int midx = threadIdx.y + blockIdx.y * blockDim.y; + int nidx = threadIdx.x + blockIdx.x * blockDim.x; DataT acc = DataT(0); for (int i = 0; i < k; ++i) { - int xidx = i + midx * k; - int yidx = i + nidx * k; + int xidx = i + midx * k; + int yidx = i + nidx * k; auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx]; acc += diff * diff; } - if (Sqrt) { - acc = raft::mySqrt(acc); - } + if (Sqrt) { acc = raft::mySqrt(acc); } ReduceOpT redOp; typedef cub::WarpReduce> WarpReduce; __shared__ typename WarpReduce::TempStorage temp[NWARPS]; int warpId = threadIdx.x / raft::WarpSize; cub::KeyValuePair tmp; - tmp.key = nidx; + tmp.key = nidx; tmp.value = midx >= m || nidx >= n ? maxVal : acc; - tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); + tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); if (threadIdx.x % raft::WarpSize == 0 && midx < m) { while (atomicCAS(workspace + midx, 0, 1) == 1) ; @@ -74,8 +74,15 @@ __global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, } template -void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, - int k, int *workspace, cudaStream_t stream) { +void naive(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + cudaStream_t stream) +{ static const dim3 TPB(32, 16, 1); dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1); CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); @@ -85,8 +92,7 @@ void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, <<>>(min, m, std::numeric_limits::max(), op); CUDA_CHECK(cudaGetLastError()); naiveKernel, 16> - <<>>(min, x, y, m, n, k, workspace, - std::numeric_limits::max()); + <<>>(min, x, y, m, n, k, workspace, std::numeric_limits::max()); CUDA_CHECK(cudaGetLastError()); } @@ -100,7 +106,8 @@ struct Inputs { template class FusedL2NNTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int m = params.m; @@ -121,7 +128,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, true, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(x)); @@ -136,25 +144,38 @@ class FusedL2NNTest : public ::testing::TestWithParam> { protected: Inputs params; DataT *x, *y, *xn, *yn; - char *workspace; - cub::KeyValuePair *min, *min_ref; + char* workspace; + cub::KeyValuePair*min, *min_ref; cudaStream_t stream; - virtual void generateGoldenResult() { + virtual void generateGoldenResult() + { int m = params.m; int n = params.n; int k = params.k; - naive(min_ref, x, y, m, n, k, (int *)workspace, stream); + naive(min_ref, x, y, m, n, k, (int*)workspace, stream); } - void runTest(cub::KeyValuePair *out) { + void runTest(cub::KeyValuePair* out) + { int m = params.m; int n = params.n; int k = params.k; MinAndDistanceReduceOp redOp; - fusedL2NN, int>( - out, x, y, xn, yn, m, n, k, (void *)workspace, redOp, - raft::distance::KVPMinReduce(), Sqrt, true, stream); + fusedL2NN, int>(out, + x, + y, + xn, + yn, + m, + n, + k, + (void*)workspace, + redOp, + raft::distance::KVPMinReduce(), + Sqrt, + true, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } }; @@ -163,9 +184,10 @@ template struct CompareApproxAbsKVP { typedef typename cub::KeyValuePair KVP; CompareApproxAbsKVP(T eps_) : eps(eps_) {} - bool operator()(const KVP &a, const KVP &b) const { - T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); - T m = std::max(raft::abs(a.value), raft::abs(b.value)); + bool operator()(const KVP& a, const KVP& b) const + { + T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); + T m = std::max(raft::abs(a.value), raft::abs(b.value)); T ratio = m >= eps ? diff / m : diff; return (ratio <= eps); } @@ -177,17 +199,20 @@ struct CompareApproxAbsKVP { template struct CompareExactKVP { typedef typename cub::KeyValuePair KVP; - bool operator()(const KVP &a, const KVP &b) const { + bool operator()(const KVP& a, const KVP& b) const + { if (a.value != b.value) return false; return true; } }; template -::testing::AssertionResult devArrMatch(const cub::KeyValuePair *expected, - const cub::KeyValuePair *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +::testing::AssertionResult devArrMatch(const cub::KeyValuePair* expected, + const cub::KeyValuePair* actual, + size_t size, + L eq_compare, + cudaStream_t stream = 0) +{ typedef typename cub::KeyValuePair KVP; std::shared_ptr exp_h(new KVP[size]); std::shared_ptr act_h(new KVP[size]); @@ -199,47 +224,42 @@ template auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { return ::testing::AssertionFailure() - << "actual=" << act.key << "," << act.value - << " != expected=" << exp.key << "," << exp.value << " @" << i; + << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << "," + << exp.value << " @" << i; } } return ::testing::AssertionSuccess(); } const std::vector> inputsf = { - {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, - {0.001f, 64, 32, 32, 1234ULL}, {0.001f, 64, 64, 32, 1234ULL}, - {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, + {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, {0.001f, 64, 32, 32, 1234ULL}, + {0.001f, 64, 64, 32, 1234ULL}, {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL}, - {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, - {0.001f, 64, 32, 34, 1234ULL}, {0.001f, 64, 64, 34, 1234ULL}, - {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, + {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, {0.001f, 64, 32, 34, 1234ULL}, + {0.001f, 64, 64, 34, 1234ULL}, {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL}, - {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, - {0.001f, 64, 32, 33, 1234ULL}, {0.001f, 64, 64, 33, 1234ULL}, - {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, + {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, {0.001f, 64, 32, 33, 1234ULL}, + {0.001f, 64, 64, 33, 1234ULL}, {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL}, {0.006f, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestF_Sq; -TEST_P(FusedL2NNTestF_Sq, Result) { +TEST_P(FusedL2NNTestF_Sq, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNTest FusedL2NNTestF_Sqrt; -TEST_P(FusedL2NNTestF_Sqrt, Result) { +TEST_P(FusedL2NNTestF_Sqrt, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.00001, 32, 32, 32, 1234ULL}, {0.00001, 32, 64, 32, 1234ULL}, @@ -260,38 +280,38 @@ const std::vector> inputsd = { {0.00001, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestD_Sq; -TEST_P(FusedL2NNTestD_Sq, Result) { +TEST_P(FusedL2NNTestD_Sq, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNTest FusedL2NNTestD_Sqrt; -TEST_P(FusedL2NNTestD_Sqrt, Result) { +TEST_P(FusedL2NNTestD_Sqrt, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd)); /// This is to test output determinism of the prim template class FusedL2NNDetTest : public FusedL2NNTest { - void SetUp() override { + void SetUp() override + { FusedL2NNTest::SetUp(); int m = this->params.m; raft::allocate(min1, m); } - void TearDown() override { + void TearDown() override + { FusedL2NNTest::TearDown(); CUDA_CHECK(cudaFree(min1)); } protected: - cub::KeyValuePair *min1; + cub::KeyValuePair* min1; static const int NumRepeats = 100; @@ -299,46 +319,46 @@ class FusedL2NNDetTest : public FusedL2NNTest { }; typedef FusedL2NNDetTest FusedL2NNDetTestF_Sq; -TEST_P(FusedL2NNDetTestF_Sq, Result) { +TEST_P(FusedL2NNDetTestF_Sq, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestF_Sqrt; -TEST_P(FusedL2NNDetTestF_Sqrt, Result) { +TEST_P(FusedL2NNDetTestF_Sqrt, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sq; -TEST_P(FusedL2NNDetTestD_Sq, Result) { +TEST_P(FusedL2NNDetTestD_Sq, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sqrt; -TEST_P(FusedL2NNDetTestD_Sqrt, Result) { +TEST_P(FusedL2NNDetTestD_Sqrt, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index e6ee09262e..e14841eb54 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -23,7 +23,8 @@ namespace raft { -TEST(Raft, EigenSolvers) { +TEST(Raft, EigenSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -35,10 +36,10 @@ TEST(Raft, EigenSolvers) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; - auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); @@ -49,7 +50,7 @@ TEST(Raft, EigenSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; @@ -60,14 +61,13 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; - EXPECT_ANY_THROW( - eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); - EXPECT_ANY_THROW( - eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); } -TEST(Raft, SpectralSolvers) { +TEST(Raft, SpectralSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -82,7 +82,7 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -96,22 +96,19 @@ TEST(Raft, SpectralSolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); auto t_exe_p = thrust::cuda::par.on(stream); - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + EXPECT_ANY_THROW( + spectral::partition(h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; - EXPECT_ANY_THROW( - spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 4cb9809844..8023fca319 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -22,7 +22,8 @@ namespace raft { -TEST(Raft, HandleDefault) { +TEST(Raft, HandleDefault) +{ handle_t h; ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); @@ -33,7 +34,8 @@ TEST(Raft, HandleDefault) { ASSERT_NE(nullptr, h.get_cusparse_handle()); } -TEST(Raft, Handle) { +TEST(Raft, Handle) +{ handle_t h(4); ASSERT_EQ(4, h.get_num_internal_streams()); cudaStream_t stream; @@ -44,13 +46,15 @@ TEST(Raft, Handle) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, GetInternalStreams) { +TEST(Raft, GetInternalStreams) +{ handle_t h(4); auto streams = h.get_internal_streams(); ASSERT_EQ(4U, streams.size()); } -TEST(Raft, GetHandleFromPool) { +TEST(Raft, GetHandleFromPool) +{ handle_t parent(4); handle_t child(parent, 2); @@ -64,7 +68,8 @@ TEST(Raft, GetHandleFromPool) { ASSERT_EQ(parent.get_device(), child.get_device()); } -TEST(Raft, GetHandleFromPoolPerf) { +TEST(Raft, GetHandleFromPoolPerf) +{ handle_t parent(100); auto start = curTimeMillis(); for (int i = 0; i < parent.get_num_internal_streams(); i++) { @@ -76,13 +81,13 @@ TEST(Raft, GetHandleFromPoolPerf) { ASSERT_LE(curTimeMillis() - start, 10); } -TEST(Raft, GetHandleStreamViews) { +TEST(Raft, GetHandleStreamViews) +{ handle_t parent(4); handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view()); - ASSERT_EQ(parent.get_internal_stream_view(2).value(), - child.get_stream_view().value()); + ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value()); EXPECT_FALSE(child.get_stream_view().is_default()); } } // namespace raft diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp index 830d085a40..d883de59fe 100644 --- a/cpp/test/integer_utils.cpp +++ b/cpp/test/integer_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, rounding_up) { +TEST(Raft, rounding_up) +{ ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2); ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0); ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1); @@ -29,7 +30,8 @@ TEST(Raft, rounding_up) { ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1); } -TEST(Raft, is_a_power_of_two) { +TEST(Raft, is_a_power_of_two) +{ ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true); ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false); } diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index dc2846fdba..209bb0355a 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -36,7 +36,8 @@ class labelTest : public ::testing::Test { }; typedef labelTest MakeMonotonicTest; -TEST_F(MakeMonotonicTest, Result) { +TEST_F(MakeMonotonicTest, Result) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -48,17 +49,14 @@ TEST_F(MakeMonotonicTest, Result) { raft::allocate(actual, m, true); raft::allocate(expected, m, true); - float *data_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; + float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; - float *expected_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; + float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); + std::shared_ptr allocator(new raft::mr::device::default_allocator); make_monotonic(actual, data, m, stream, allocator); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -73,37 +71,36 @@ TEST_F(MakeMonotonicTest, Result) { delete expected_h; } -TEST(labelTest, Classlabels) { +TEST(labelTest, Classlabels) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); + std::shared_ptr allocator(new raft::mr::device::default_allocator); int n_rows = 6; - float *y_d; + float* y_d; raft::allocate(y_d, n_rows); float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); int n_classes; - float *y_unique_d; + float* y_unique_d; getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, - raft::Compare(), stream)); + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, raft::Compare(), stream)); - float *y_relabeled_d; + float* y_relabeled_d; raft::allocate(y_relabeled_d, n_rows); getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; - EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, - raft::Compare(), stream)); + EXPECT_TRUE( + devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare(), stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(y_d)); diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..3d930ff22e 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -39,8 +39,7 @@ struct MergeLabelsInputs { }; template -class MergeLabelsTest - : public ::testing::TestWithParam> { +class MergeLabelsTest : public ::testing::TestWithParam> { protected: MergeLabelsTest() : params(::testing::TestWithParam>::GetParam()), @@ -50,25 +49,23 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(1, stream) {} - - void Run() { - raft::update_device(labels_a.data(), params.labels_a.data(), params.N, - stream); - raft::update_device(labels_b.data(), params.labels_b.data(), params.N, - stream); - raft::update_device(expected.data(), params.expected.data(), params.N, - stream); - raft::update_device(mask.data(), - reinterpret_cast(params.mask.data()), params.N, - stream); - - merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(), - m.data(), params.N, stream); + m(1, stream) + { + } + + void Run() + { + raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream); + raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream); + raft::update_device(expected.data(), params.expected.data(), params.N, stream); + raft::update_device(mask.data(), reinterpret_cast(params.mask.data()), params.N, stream); + + merge_labels( + labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream); cudaStreamSynchronize(stream); - ASSERT_TRUE(raft::devArrMatch(expected.data(), labels_a.data(), - params.N, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + expected.data(), labels_a.data(), params.N, raft::Compare())); } protected: @@ -85,22 +82,14 @@ TEST_P(MergeLabelsTestI, Result) { Run(); } using MergeLabelsTestL = MergeLabelsTest; TEST_P(MergeLabelsTestL, Result) { Run(); } -constexpr int MAX32 = std::numeric_limits::max(); +constexpr int MAX32 = std::numeric_limits::max(); constexpr int64_t MAX64 = std::numeric_limits::max(); const std::vector> merge_inputs_32 = { {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX32}, - {1, 2, MAX32, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX32, 1, 3, MAX32}, {1, 2, 3, 2, MAX32, 2, 2, 2}, @@ -116,16 +105,8 @@ const std::vector> merge_inputs_32 = { const std::vector> merge_inputs_64 = { {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX64}, - {1, 2, MAX64, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX64, 1, 3, MAX64}, {1, 2, 3, 2, MAX64, 2, 2, 2}, @@ -138,10 +119,8 @@ const std::vector> merge_inputs_64 = { {1, 1, 1, 1, 1, 7, 7, 7}}, }; -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, - ::testing::ValuesIn(merge_inputs_32)); -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, - ::testing::ValuesIn(merge_inputs_64)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64)); } // namespace label } // namespace raft diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 04f473f836..61c7182c72 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -29,11 +29,11 @@ #include #include -#define PROBLEMSIZE 1000 // Number of rows/columns -#define BATCHSIZE 10 // Number of problems in the batch -#define COSTRANGE 1000 +#define PROBLEMSIZE 1000 // Number of rows/columns +#define BATCHSIZE 10 // Number of problems in the batch +#define COSTRANGE 1000 #define PROBLEMCOUNT 1 -#define REPETITIONS 1 +#define REPETITIONS 1 #define SEED 01010001 @@ -43,41 +43,43 @@ namespace raft { // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE]. template -void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) { +void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange) +{ long N2 = SP * N * N; std::uniform_int_distribution distribution(0, costrange); for (long i = 0; i < N2; i++) { - int val = distribution(generator); + int val = distribution(generator); cost_matrix[i] = (weight_t)val; } } template -void hungarian_test(int problemsize, int costrange, int problemcount, - int repetitions, int batchsize, weight_t epsilon, - bool verbose = false) { +void hungarian_test(int problemsize, + int costrange, + int problemcount, + int repetitions, + int batchsize, + weight_t epsilon, + bool verbose = false) +{ raft::handle_t handle; - weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize]; + weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize]; for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize * problemsize); raft::mr::device::buffer row_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize); raft::mr::device::buffer col_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize); - raft::update_device(elements_v.data(), h_cost, - batchsize * problemsize * problemsize, - handle.get_stream()); + raft::update_device( + elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream()); for (int i = 0; i < repetitions; i++) { float start = omp_get_wtime(); @@ -87,20 +89,18 @@ void hungarian_test(int problemsize, int costrange, int problemcount, handle, problemsize, batchsize, epsilon); // Solve LAP(s) for given cost matrix - lpx.solve(elements_v.data(), row_assignment_v.data(), - col_assignment_v.data()); + lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data()); float end = omp_get_wtime(); float total_time = (end - start); if (verbose) { - // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match. + // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual + // objectives. At optimality both values should match. for (int k = 0; k < batchsize; k++) { - std::cout << j << ":" << i << ":" << k << ":" - << lpx.getPrimalObjectiveValue(k) << ":" - << lpx.getDualObjectiveValue(k) << ":" << total_time - << std::endl; + std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":" + << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl; } } } @@ -109,34 +109,38 @@ void hungarian_test(int problemsize, int costrange, int problemcount, delete[] h_cost; } -TEST(Raft, HungarianIntFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianIntFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianIntDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianIntDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianIntLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianIntLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } -TEST(Raft, HungarianLongFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianLongFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianLongDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, - REPETITIONS, BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianLongDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianLongLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianLongLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } } // namespace raft diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 2fc9d4e30f..38e189f27e 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -27,7 +27,8 @@ namespace linalg { template class AddTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -42,7 +43,8 @@ class AddTest : public ::testing::TestWithParam> { add(out, in1, in2, len, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); @@ -51,9 +53,10 @@ class AddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void compare() { - ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); + void compare() + { + ASSERT_TRUE( + raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } protected: diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 137419758f..1d9352bfc1 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -23,18 +23,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2, - int len) { +__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = OutT(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); } } template -void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) { +void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -47,8 +46,8 @@ struct AddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const AddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const AddInputs& dims) +{ return os; } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 3ae4f86066..078c41356a 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -29,20 +29,19 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void binaryOpLaunch( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -class BinaryOpTest - : public ::testing::TestWithParam> { +class BinaryOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - BinaryOpInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -59,7 +58,8 @@ class BinaryOpTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -72,67 +72,61 @@ class BinaryOpTest OutType *out_ref, *out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32; -TEST_P(BinaryOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i64; -TEST_P(BinaryOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32_D; -TEST_P(BinaryOpTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32_D, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i32; -TEST_P(BinaryOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i64; -TEST_P(BinaryOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); template class BinaryOpAlignment : public ::testing::Test { protected: - BinaryOpAlignment() { + BinaryOpAlignment() + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void Misaligned() { + void Misaligned() + { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; @@ -142,8 +136,12 @@ class BinaryOpAlignment : public ::testing::Test { CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( - z.data() + 9, x.data() + 137, y.data() + 19, 256, - [] __device__(math_t x, math_t y) { return x + y; }, stream); + z.data() + 9, + x.data() + 137, + y.data() + 19, + 256, + [] __device__(math_t x, math_t y) { return x + y; }, + stream); } raft::handle_t handle; diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index fd8ed6dd1e..97cb3ecb24 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -24,18 +24,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len) { +__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); - if (idx < len) { - out[idx] = static_cast(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = static_cast(in1[idx] + in2[idx]); } } template -void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { +void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ static const IdxType TPB = 64; - IdxType nblks = raft::ceildiv(len, TPB); + IdxType nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -48,8 +47,8 @@ struct BinaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const BinaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) +{ return os; } diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 00236d53fa..5bbe3166cf 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -36,7 +36,8 @@ class CholeskyR1Test : public ::testing::Test { L(allocator, handle.get_stream(), n_rows * n_rows), L_exp(allocator, handle.get_stream(), n_rows * n_rows), devInfo(allocator, handle.get_stream(), 1), - workspace(allocator, handle.get_stream()) { + workspace(allocator, handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -48,55 +49,58 @@ class CholeskyR1Test : public ::testing::Test { int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace // requirements. - raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr, - &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes); workspace.resize(Lwork, stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testR1Update() { + void testR1Update() + { int n = n_rows * n_rows; - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), n, stream); for (int rank = 1; rank <= n_rows; rank++) { std::stringstream ss; - ss << "Rank " << rank - << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); + ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); SCOPED_TRACE(ss.str()); // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf( - solver_handle, uplo, rank, L_exp.data(), n_rows, - (math_t*)workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + stream)); // Incremental Cholesky factorization using rank one updates. - raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows, - workspace.data(), &Lwork, uplo, - stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream); - ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank, - raft::CompareApprox(3e-3))); + ASSERT_TRUE(raft::devArrMatch( + L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox(3e-3))); } } } - void testR1Error() { + void testR1Error() + { raft::update_device(G.data(), G2_host, 4, stream); - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), 4, stream); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream)); - ASSERT_THROW( - raft::linalg::choleskyRank1Update( - handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), - raft::exception); + ASSERT_THROW(raft::linalg::choleskyRank1Update( + handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), + raft::exception); math_t eps = std::numeric_limits::epsilon(); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index e45f5651b4..2760d522bc 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -33,8 +33,8 @@ struct coalescedReductionInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const coalescedReductionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs& dims) +{ return os; } @@ -42,17 +42,18 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream, bool inplace = false) { - coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace, - [] __device__(T in, int i) { return in * in; }); +void coalescedReductionLaunch( + T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false) +{ + coalescedReduction( + dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; }); } template -class coalescedReductionTest - : public ::testing::TestWithParam> { +class coalescedReductionTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; @@ -73,7 +74,8 @@ class coalescedReductionTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -84,34 +86,36 @@ class coalescedReductionTest T *data, *dots_exp, *dots_act; }; -const std::vector> inputsf = { - {0.000002f, 1024, 32, 1234ULL}, - {0.000002f, 1024, 64, 1234ULL}, - {0.000002f, 1024, 128, 1234ULL}, - {0.000002f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.000002f, 1024, 32, 1234ULL}, + {0.000002f, 1024, 64, 1234ULL}, + {0.000002f, 1024, 128, 1234ULL}, + {0.000002f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef coalescedReductionTest coalescedReductionTestF; -TEST_P(coalescedReductionTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } typedef coalescedReductionTest coalescedReductionTestD; -TEST_P(coalescedReductionTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 2396558939..d8995ffa0a 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -25,30 +25,27 @@ namespace raft { namespace linalg { template -__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] / scalar; - } + if (idx < len) { out[idx] = in[idx] / scalar; } } template -void naiveDivide(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveDivideKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } template -class DivideTest - : public ::testing::TestWithParam> { +class DivideTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; cudaStream_t stream; @@ -63,7 +60,8 @@ class DivideTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -74,25 +72,21 @@ class DivideTest T *in, *out_ref, *out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef DivideTest DivideTestF; -TEST_P(DivideTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(DivideTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf)); typedef DivideTest DivideTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(DivideTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(DivideTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 159d288174..5cad657dab 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -35,14 +35,16 @@ struct EigInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigInputs& dims) +{ return os; } template class EigTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { raft::handle_t handle; stream = handle.get_stream(); @@ -51,8 +53,8 @@ class EigTest : public ::testing::TestWithParam> { int len = params.len; raft::allocate(cov_matrix, len); - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); @@ -61,10 +63,23 @@ class EigTest : public ::testing::TestWithParam> { raft::allocate(eig_vectors_jacobi, len); raft::allocate(eig_vals_jacobi, params.n_col); - T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, - 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, - 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; + T eig_vectors_ref_h[] = {0.2790, + -0.6498, + 0.6498, + -0.2789, + -0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; raft::allocate(eig_vectors_ref, len); raft::allocate(eig_vals_ref, params.n_col); @@ -72,13 +87,19 @@ class EigTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); - eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, - stream); + eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, stream); - T tol = 1.e-7; + T tol = 1.e-7; int sweeps = 15; - eigJacobi(handle, cov_matrix, params.n_row, params.n_col, - eig_vectors_jacobi, eig_vals_jacobi, stream, tol, sweeps); + eigJacobi(handle, + cov_matrix, + params.n_row, + params.n_col, + eig_vectors_jacobi, + eig_vals_jacobi, + stream, + tol, + sweeps); // test code for comparing two methods len = params.n * params.n; @@ -90,14 +111,20 @@ class EigTest : public ::testing::TestWithParam> { r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); - eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, - eig_vals_large, stream); - eigJacobi(handle, cov_matrix_large, params.n, params.n, - eig_vectors_jacobi_large, eig_vals_jacobi_large, stream, tol, + eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, eig_vals_large, stream); + eigJacobi(handle, + cov_matrix_large, + params.n, + params.n, + eig_vectors_jacobi_large, + eig_vals_jacobi_large, + stream, + tol, sweeps); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(cov_matrix)); CUDA_CHECK(cudaFree(eig_vectors)); CUDA_CHECK(cudaFree(eig_vectors_jacobi)); @@ -109,89 +136,95 @@ class EigTest : public ::testing::TestWithParam> { protected: EigInputs params; - T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, - *eig_vals_jacobi, *eig_vals_ref; + T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, *eig_vals_jacobi, + *eig_vals_ref; - T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, - *eig_vals_large, *eig_vals_jacobi_large; + T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, *eig_vals_large, + *eig_vals_jacobi_large; cudaStream_t stream; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigTest EigTestValF; -TEST_P(EigTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValD; -TEST_P(EigTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecF; -TEST_P(EigTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecD; -TEST_P(EigTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiF; -TEST_P(EigTestValJacobiF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiD; -TEST_P(EigTestValJacobiD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiF; -TEST_P(EigTestVecJacobiF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref, + eig_vectors_jacobi, + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiD; -TEST_P(EigTestVecJacobiD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref, + eig_vectors_jacobi, + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareF; -TEST_P(EigTestVecCompareF, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large, + eig_vectors_jacobi_large, + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareD; -TEST_P(EigTestVecCompareD, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large, + eig_vectors_jacobi_large, + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2)); @@ -202,17 +235,13 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b3980f281d..b3cfb19174 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -37,32 +37,44 @@ struct EigSelInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigSelInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigSelInputs& dims) +{ return os; } template class EigSelTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { raft::handle_t handle; stream = handle.get_stream(); - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); int len = params.len; raft::allocate(cov_matrix, len); - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); raft::allocate(eig_vectors, 12); raft::allocate(eig_vals, params.n_col); - T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, - -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; + T eig_vectors_ref_h[] = {-0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; raft::allocate(eig_vectors_ref, 12); raft::allocate(eig_vals_ref, params.n_col); @@ -70,11 +82,19 @@ class EigSelTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); - eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, - eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); + eigSelDC(handle, + cov_matrix, + params.n_row, + params.n_col, + 3, + eig_vectors, + eig_vals, + EigVecMemUsage::OVERWRITE_INPUT, + stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(cov_matrix)); CUDA_CHECK(cudaFree(eig_vectors)); CUDA_CHECK(cudaFree(eig_vals)); @@ -89,51 +109,45 @@ class EigSelTest : public ::testing::TestWithParam> { cudaStream_t stream; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigSelTest EigSelTestValF; -TEST_P(EigSelTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestValD; -TEST_P(EigSelTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecF; -TEST_P(EigSelTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecD; -TEST_P(EigSelTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 572951c557..f0e04403e8 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -26,19 +26,17 @@ namespace linalg { //// Testing unary ops template -__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = scalar * in[idx]; - } + if (idx < len) { out[idx] = scalar * in[idx]; } } template -void naiveScale(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -52,19 +50,19 @@ struct ScalarMultiplyInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ScalarMultiplyInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs& dims) +{ return os; } template -class ScalarMultiplyTest - : public ::testing::TestWithParam> { +class ScalarMultiplyTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; T scalar = params.scalar; cudaStream_t stream; @@ -78,7 +76,8 @@ class ScalarMultiplyTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -89,46 +88,41 @@ class ScalarMultiplyTest T *in, *out_ref, *out; }; -const std::vector> inputsf1 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; const std::vector> inputsd1 = { {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; typedef ScalarMultiplyTest ScalarMultiplyTestF; -TEST_P(ScalarMultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } typedef ScalarMultiplyTest ScalarMultiplyTestD; -TEST_P(ScalarMultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, - ::testing::ValuesIn(inputsf1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1)); -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, - ::testing::ValuesIn(inputsd1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1)); //// Testing binary ops template -__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, - int len) { +__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] + in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] + in2[idx]; } } template -void naiveAdd(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -141,15 +135,16 @@ struct EltwiseAddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const EltwiseAddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs& dims) +{ return os; } template class EltwiseAddTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -167,7 +162,8 @@ class EltwiseAddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -179,29 +175,25 @@ class EltwiseAddTest : public ::testing::TestWithParam> { T *in1, *in2, *out_ref, *out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef EltwiseAddTest EltwiseAddTestF; -TEST_P(EltwiseAddTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } typedef EltwiseAddTest EltwiseAddTestD; -TEST_P(EltwiseAddTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index cecfc5eb8e..e95dbbc502 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -36,9 +36,9 @@ struct GemmLayoutInputs { // Reference GEMM implementation. template -__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, - bool isZColMajor, bool isXColMajor, - bool isYColMajor) { +__global__ void naiveGemm( + T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor) +{ int tidx = blockIdx.x * blockDim.x + threadIdx.x; int tidy = blockIdx.y * blockDim.y + threadIdx.y; @@ -51,7 +51,7 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, temp += X[xIndex] * Y[yIndex]; } int zIndex = isZColMajor ? m + n * M : m * N + n; - Z[zIndex] = temp; + Z[zIndex] = temp; } } } @@ -59,7 +59,8 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, template class GemmLayoutTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; @@ -72,8 +73,8 @@ class GemmLayoutTest : public ::testing::TestWithParam> { // Dimensions of Y : K x N // Dimensions of Z : M x N - T *X = NULL; // Argument X - T *Y = NULL; // Argument Y + T* X = NULL; // Argument X + T* Y = NULL; // Argument Y size_t xElems = params.M * params.K; size_t yElems = params.K * params.N; @@ -87,27 +88,35 @@ class GemmLayoutTest : public ::testing::TestWithParam> { r.uniform(X, xElems, T(-10.0), T(10.0), stream); r.uniform(Y, yElems, T(-10.0), T(10.0), stream); - dim3 blocks(raft::ceildiv(params.M, 128), - raft::ceildiv(params.N, 4), 1); + dim3 blocks(raft::ceildiv(params.M, 128), raft::ceildiv(params.N, 4), 1); dim3 threads(128, 4, 1); - naiveGemm<<>>(refZ, X, Y, params.M, params.N, params.K, - params.zLayout, params.xLayout, - params.yLayout); - - gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout, - params.xLayout, params.yLayout, stream); + naiveGemm<<>>( + refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout); + + gemm(handle, + Z, + X, + Y, + params.M, + params.N, + params.K, + params.zLayout, + params.xLayout, + params.yLayout, + stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(refZ)); CUDA_CHECK(cudaFree(Z)); } protected: GemmLayoutInputs params; - T *refZ = NULL; // Reference result for comparison - T *Z = NULL; // Computed result + T* refZ = NULL; // Reference result for comparison + T* Z = NULL; // Computed result }; const std::vector> inputsf = { @@ -131,22 +140,20 @@ const std::vector> inputsd = { {50, 80, 60, false, false, false, 893038ULL}}; typedef GemmLayoutTest GemmLayoutTestF; -TEST_P(GemmLayoutTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-4))); +TEST_P(GemmLayoutTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-4))); } typedef GemmLayoutTest GemmLayoutTestD; -TEST_P(GemmLayoutTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-6))); +TEST_P(GemmLayoutTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-6))); } -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 227bce6a48..0e33d9758f 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -26,13 +26,22 @@ namespace raft { namespace linalg { template -void mapLaunch(OutType *out, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { +void mapLaunch(OutType* out, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ map( - out, len, + out, + len, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - stream, in1, in2, in3); + stream, + in1, + in2, + in3); } template @@ -44,10 +53,15 @@ struct MapInputs { }; template -void create_ref(OutType *out_ref, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { - InType *tmp; +void create_ref(OutType* out_ref, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ + InType* tmp; allocate(tmp, len); eltwiseAdd(tmp, in1, in2, len, stream); eltwiseAdd(out_ref, tmp, in3, len, stream); @@ -56,12 +70,11 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2, } template -class MapTest - : public ::testing::TestWithParam> { +class MapTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -81,7 +94,8 @@ class MapTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(in3)); @@ -95,55 +109,47 @@ class MapTest OutType *out_ref, *out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL, 3.2}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}}; typedef MapTest MapTestF_i32; -TEST_P(MapTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL, 9.4}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}}; typedef MapTest MapTestF_i64; -TEST_P(MapTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL, 5.9}}; typedef MapTest MapTestF_i32_D; -TEST_P(MapTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32_D, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL, 7.5}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}}; typedef MapTest MapTestD_i32; -TEST_P(MapTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL, 5.2}}; typedef MapTest MapTestD_i64; -TEST_P(MapTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 6e146fa4bb..a1b82e7644 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -25,21 +25,18 @@ namespace raft { namespace linalg { template -__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len, - MapOp map) { +__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - raft::myAtomicAdd(out, (OutType)map(in[idx])); - } + if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); } } template -void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map, - cudaStream_t stream) { +void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, (size_t)TPB); - naiveMapReduceKernel - <<>>(out, in, len, map); + int nblks = raft::ceildiv(len, (size_t)TPB); + naiveMapReduceKernel<<>>(out, in, len, map); CUDA_CHECK(cudaPeekAtLastError()); } @@ -51,7 +48,8 @@ struct MapReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs& dims) +{ return os; } @@ -59,8 +57,9 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in, - size_t len, cudaStream_t stream) { +void mapReduceLaunch( + OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream) +{ auto op = [] __device__(InType in) { return in; }; naiveMapReduce(out_ref, in, len, op, stream); mapThenSumReduce(out, len, op, 0, in); @@ -69,7 +68,8 @@ void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in, template class MapReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); auto len = params.len; @@ -84,7 +84,8 @@ class MapReduceTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -92,48 +93,44 @@ class MapReduceTest : public ::testing::TestWithParam> { protected: MapReduceInputs params; - InType *in; + InType* in; OutType *out_ref, *out; }; -const std::vector> inputsf = { - {0.001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.001f, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestFF; -TEST_P(MapReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf)); typedef MapReduceTest MapReduceTestFD; -TEST_P(MapReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf)); -const std::vector> inputsd = { - {0.000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.000001, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestDD; -TEST_P(MapReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestDD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd)); template class MapGenericReduceTest : public ::testing::Test { - using InType = typename T::first_type; + using InType = typename T::first_type; using OutType = typename T::second_type; protected: MapGenericReduceTest() : allocator(handle.get_device_allocator()), input(allocator, handle.get_stream(), n), - output(allocator, handle.get_stream(), 1) { + output(allocator, handle.get_stream(), 1) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -142,7 +139,8 @@ class MapGenericReduceTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void initInput(InType *input, int n, cudaStream_t stream) { + void initInput(InType* input, int n, cudaStream_t stream) + { raft::random::Rng r(137); r.uniform(input, n, InType(2), InType(3), stream); InType val = 1; @@ -151,21 +149,19 @@ class MapGenericReduceTest : public ::testing::Test { raft::update_device(input + 337, &val, 1, stream); } - void testMin() { - auto op = [] __device__(InType in) { return in; }; + void testMin() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::max(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare())); } - void testMax() { - auto op = [] __device__(InType in) { return in; }; + void testMax() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::min(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare())); } protected: @@ -178,8 +174,7 @@ class MapGenericReduceTest : public ::testing::Test { }; using IoTypePair = - ::testing::Types, std::pair, - std::pair>; + ::testing::Types, std::pair, std::pair>; TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair); TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); } diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index aa46c78b0f..6ad9bfba10 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -32,8 +32,8 @@ struct MatVecOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MatVecOpInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) +{ return os; } @@ -41,26 +41,48 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, bool useTwoVectors, - cudaStream_t stream) { +void matrixVectorOpLaunch(T* out, + const T* in, + const T* vec1, + const T* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + bool useTwoVectors, + cudaStream_t stream) +{ if (useTwoVectors) { matrixVectorOp( - out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b, T c) { return a + b + c; }, stream); + out, + in, + vec1, + vec2, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b, T c) { return a + b + c; }, + stream); } else { matrixVectorOp( - out, in, vec1, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b) { return a + b; }, stream); + out, + in, + vec1, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b) { return a + b; }, + stream); } } template -class MatVecOpTest - : public ::testing::TestWithParam> { +class MatVecOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); IdxType N = params.rows, D = params.cols; @@ -78,18 +100,25 @@ class MatVecOpTest r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); if (params.useTwoVectors) { - naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, - params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0); } else { - naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, - params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0); } - matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, - params.bcastAlongRows, params.useTwoVectors, stream); + matrixVectorOpLaunch(out, + in, + vec1, + vec2, + D, + N, + params.rowMajor, + params.bcastAlongRows, + params.useTwoVectors, + stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(vec1)); CUDA_CHECK(cudaFree(vec2)); CUDA_CHECK(cudaFree(out)); @@ -121,23 +150,23 @@ const std::vector> inputsf_i32 = { {0.00001f, 1024, 32, false, false, true, 1234ULL}, {0.00001f, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i32; -TEST_P(MatVecOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.00001f, 2500, 250, false, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i64; -TEST_P(MatVecOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, true, true, false, 1234ULL}, @@ -158,23 +187,23 @@ const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, false, false, true, 1234ULL}, {0.0000001, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i32; -TEST_P(MatVecOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.0000001, 2500, 250, false, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i64; -TEST_P(MatVecOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 69c45c9866..5f9c6f1ef3 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -22,9 +22,15 @@ namespace raft { namespace linalg { template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -37,27 +43,37 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -70,20 +86,25 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2, - IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel<<>>(out, mat, vec1, vec2, D, N, rowMajor, - bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel + <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index 1d3e753de3..6c38d89891 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -27,7 +27,8 @@ namespace linalg { template class MultiplyTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -43,7 +44,8 @@ class MultiplyTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -54,25 +56,21 @@ class MultiplyTest : public ::testing::TestWithParam> { T *in, *out_ref, *out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef MultiplyTest MultiplyTestF; -TEST_P(MultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(MultiplyTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf)); typedef MultiplyTest MultiplyTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(MultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(MultiplyTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index acc25addd0..35bc72dee4 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -34,17 +34,19 @@ struct NormInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const NormInputs &I) { - os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " - << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl; +::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) +{ + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", " + << I.do_sqrt << ", " << I.seed << '}' << std::endl; return os; } ///// Row-wise norm test definitions template -__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { - Type acc = (Type)0; +__global__ void naiveRowNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ + Type acc = (Type)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { @@ -59,19 +61,20 @@ __global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveRowNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveRowNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(N, TPB); + naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } template class RowNormTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -82,19 +85,18 @@ class RowNormTest : public ::testing::TestWithParam> { raft::allocate(dots_exp, rows); raft::allocate(dots_act, rows); r.uniform(data, len, T(-1.0), T(1.0), stream); - naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, - stream); + naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, - fin_op); + rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op); } else { rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -109,10 +111,11 @@ class RowNormTest : public ::testing::TestWithParam> { ///// Column-wise norm test definitisons template -__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { +__global__ void naiveColNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ int colID = threadIdx.x + blockIdx.x * blockDim.x; - if (colID > D) return; //avoid out-of-bounds thread + if (colID > D) return; // avoid out-of-bounds thread Type acc = 0; for (int i = 0; i < N; i++) { @@ -124,19 +127,20 @@ __global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveColNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(D, TPB); - naiveColNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(D, TPB); + naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } template class ColNormTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -148,19 +152,18 @@ class ColNormTest : public ::testing::TestWithParam> { raft::allocate(dots_exp, cols); raft::allocate(dots_act, cols); - naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, - stream); + naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, - fin_op); + colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op); } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -174,24 +177,23 @@ class ColNormTest : public ::testing::TestWithParam> { }; ///// Row- and column-wise tests -const std::vector> inputsf = { - {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, - - {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, + + {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; const std::vector> inputsd = { {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL}, @@ -213,22 +215,22 @@ const std::vector> inputsd = { {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}}; typedef RowNormTest RowNormTestF; -TEST_P(RowNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } typedef RowNormTest RowNormTestD; -TEST_P(RowNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd)); const std::vector> inputscf = { {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, @@ -269,22 +271,22 @@ const std::vector> inputscd = { {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; typedef ColNormTest ColNormTestF; -TEST_P(ColNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } typedef ColNormTest ColNormTestD; -TEST_P(ColNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, - ::testing::ValuesIn(inputscf)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf)); -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, - ::testing::ValuesIn(inputscd)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 9082397265..85c84777e4 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -34,8 +34,8 @@ struct ReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ReduceInputs& dims) +{ return os; } @@ -43,45 +43,55 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void reduceLaunch(OutType *dots, const InType *data, int cols, int rows, - bool rowMajor, bool alongRows, bool inplace, - cudaStream_t stream) { - reduce( - dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace, - [] __device__(InType in, int i) { return static_cast(in * in); }); +void reduceLaunch(OutType* dots, + const InType* data, + int cols, + int rows, + bool rowMajor, + bool alongRows, + bool inplace, + cudaStream_t stream) +{ + reduce(dots, + data, + cols, + rows, + (OutType)0, + rowMajor, + alongRows, + stream, + inplace, + [] __device__(InType in, int i) { return static_cast(in * in); }); } template -class ReduceTest - : public ::testing::TestWithParam> { +class ReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); - params = - ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - outlen = params.alongRows ? rows : cols; + outlen = params.alongRows ? rows : cols; raft::allocate(data, len); raft::allocate(dots_exp, outlen); raft::allocate(dots_act, outlen); r.uniform(data, len, InType(-1.0), InType(1.0), stream); - naiveReduction(dots_exp, data, cols, rows, params.rowMajor, - params.alongRows, stream); + naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream); // Perform reduction with default inplace = false first - reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, - false, stream); + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, false, stream); // Add to result with inplace = true next, which shouldn't affect // in the case of coalescedReduction! if (!(params.rowMajor ^ params.alongRows)) { - reduceLaunch(dots_act, data, cols, rows, params.rowMajor, - params.alongRows, true, stream); + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, true, stream); } } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -90,7 +100,7 @@ class ReduceTest protected: ReduceInputs params; - InType *data; + InType* data; OutType *dots_exp, *dots_act; int outlen; cudaStream_t stream; @@ -151,31 +161,31 @@ const std::vector> inputsfd = { {0.000002f, 1024, 256, false, false, 1234ULL}}; typedef ReduceTest ReduceTestFF; -TEST_P(ReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFF, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestDD; -TEST_P(ReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestDD, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestFD; -TEST_P(ReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFD, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, - ::testing::ValuesIn(inputsff)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, - ::testing::ValuesIn(inputsdd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, - ::testing::ValuesIn(inputsfd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 30a9c2e271..86f9c2d8b8 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -26,52 +26,69 @@ namespace raft { namespace linalg { template -__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data, - int D, int N) { - OutType acc = (OutType)0; +__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N) +{ + OutType acc = (OutType)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { - acc += - static_cast(data[rowStart * D + i] * data[rowStart * D + i]); + acc += static_cast(data[rowStart * D + i] * data[rowStart * D + i]); } dots[rowStart] = 2 * acc; } } template -void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { +void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveCoalescedReductionKernel - <<>>(dots, data, D, N); + int nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel<<>>(dots, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); } template -void unaryAndGemv(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { - //computes a MLCommon unary op on data (squares it), then computes Ax +void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ + // computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns thrust::device_vector sq(D * N); raft::linalg::unaryOp( - thrust::raw_pointer_cast(sq.data()), data, D * N, - [] __device__(InType v) { return static_cast(v * v); }, stream); + thrust::raw_pointer_cast(sq.data()), + data, + D * N, + [] __device__(InType v) { return static_cast(v * v); }, + stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] + thrust::device_vector ones(N, 1); // column vector [1...1] OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, - thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv(handle, + CUBLAS_OP_N, + D, + N, + &alpha, + thrust::raw_pointer_cast(sq.data()), + D, + thrust::raw_pointer_cast(ones.data()), + 1, + &beta, + dots, + 1, + stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } template -void naiveReduction(OutType *dots, const InType *data, int D, int N, - bool rowMajor, bool alongRows, cudaStream_t stream) { +void naiveReduction(OutType* dots, + const InType* data, + int D, + int N, + bool rowMajor, + bool alongRows, + cudaStream_t stream) +{ if (rowMajor && alongRows) { naiveCoalescedReduction(dots, data, D, N, stream); } else if (rowMajor && !alongRows) { diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index b27fa2ac1a..57699cb050 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -32,17 +32,17 @@ struct stridedReductionInputs { }; template -void stridedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream) { - stridedReduction(dots, data, cols, rows, (T)0, stream, false, - [] __device__(T in, int i) { return in * in; }); +void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream) +{ + stridedReduction( + dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; }); } template -class stridedReductionTest - : public ::testing::TestWithParam> { +class stridedReductionTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -50,16 +50,17 @@ class stridedReductionTest int len = rows * cols; raft::allocate(data, len); - raft::allocate(dots_exp, cols); //expected dot products (from test) - raft::allocate(dots_act, cols); //actual dot products (from prim) + raft::allocate(dots_exp, cols); // expected dot products (from test) + raft::allocate(dots_act, cols); // actual dot products (from prim) r.uniform(data, len, T(-1.0), T(1.0), - stream); //initialize matrix to random + stream); // initialize matrix to random unaryAndGemv(dots_exp, data, cols, rows, stream); stridedReductionLaunch(dots_act, data, cols, rows, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -72,35 +73,33 @@ class stridedReductionTest cudaStream_t stream; }; -const std::vector> inputsf = { - {0.00001f, 1024, 32, 1234ULL}, - {0.00001f, 1024, 64, 1234ULL}, - {0.00001f, 1024, 128, 1234ULL}, - {0.00001f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef stridedReductionTest stridedReductionTestF; -TEST_P(stridedReductionTestF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } typedef stridedReductionTest stridedReductionTestD; -TEST_P(stridedReductionTestD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index ced3f65fdd..4295b91f3e 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -24,39 +24,34 @@ namespace raft { namespace linalg { template -__global__ void naiveSubtractElemKernel(Type *out, const Type *in1, - const Type *in2, int len) { +__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] - in2[idx]; } } template -void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveSubtractElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1, - const Type in2, int len) { +__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2; - } + if (idx < len) { out[idx] = in1[idx] - in2; } } template -void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len, - cudaStream_t stream) { +void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveSubtractScalarKernel - <<>>(out, in1, in2, len); + int nblks = raft::ceildiv(len, TPB); + naiveSubtractScalarKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -68,14 +63,16 @@ struct SubtractInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SubtractInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SubtractInputs& dims) +{ return os; } template class SubtractTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -98,7 +95,8 @@ class SubtractTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -110,35 +108,33 @@ class SubtractTest : public ::testing::TestWithParam> { T *in1, *in2, *out_ref, *out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef SubtractTest SubtractTestF; -TEST_P(SubtractTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestF, Result) +{ + ASSERT_TRUE( + raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox(params.tolerance))); } typedef SubtractTest SubtractTestD; -TEST_P(SubtractTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestD, Result) +{ + ASSERT_TRUE( + raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index fff321768f..e9e1a6dc02 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -35,19 +35,21 @@ struct SvdInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SvdInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SvdInputs& dims) +{ return os; } template class SvdTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { raft::handle_t handle; params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; cudaStream_t stream = handle.get_stream(); raft::allocate(data, len); @@ -56,7 +58,7 @@ class SvdTest : public ::testing::TestWithParam> { T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; raft::update_device(data, data_h, len, stream); - int left_evl = params.n_row * params.n_col; + int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; raft::allocate(left_eig_vectors_qr, left_evl); @@ -67,8 +69,7 @@ class SvdTest : public ::testing::TestWithParam> { // allocate(right_eig_vectors_trans_jacobi, right_evl); // allocate(sing_vals_jacobi, params.n_col); - T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, - 0.488195, 0.110706, -0.865685}; + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685}; T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636}; @@ -78,18 +79,25 @@ class SvdTest : public ::testing::TestWithParam> { raft::allocate(right_eig_vectors_ref, right_evl); raft::allocate(sing_vals_ref, params.n_col); - raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, - stream); - raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, - right_evl, stream); + raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream); + raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, right_evl, stream); raft::update_device(sing_vals_ref, sing_vals_ref_h, params.n_col, stream); - svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, - left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, + svdQR(handle, + data, + params.n_row, + params.n_col, + sing_vals_qr, + left_eig_vectors_qr, + right_eig_vectors_trans_qr, + true, + true, + true, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(left_eig_vectors_qr)); CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); @@ -101,69 +109,71 @@ class SvdTest : public ::testing::TestWithParam> { protected: SvdInputs params; - T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, - *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; + T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, + *right_eig_vectors_ref, *sing_vals_ref; }; -const std::vector> inputsf2 = { - {0.00001f, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00001, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}}; typedef SvdTest SvdTestValF; -TEST_P(SvdTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestValD; -TEST_P(SvdTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecF; -TEST_P(SvdTestLeftVecF, Result) { - ASSERT_TRUE(raft::devArrMatch( - left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref, + left_eig_vectors_qr, + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecD; -TEST_P(SvdTestLeftVecD, Result) { - ASSERT_TRUE(raft::devArrMatch( - left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref, + left_eig_vectors_qr, + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecF; -TEST_P(SvdTestRightVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref, + right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecD; -TEST_P(SvdTestRightVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref, + right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2)); // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, // ::testing::ValuesIn(inputsf2)); diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index f10b029962..659bed04c6 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -34,14 +34,16 @@ struct TranposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const TranposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const TranposeInputs& dims) +{ return os; } template class TransposeTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); stream = handle.get_stream(); @@ -63,7 +65,8 @@ class TransposeTest : public ::testing::TestWithParam> { transpose(data, params.n_row, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(data_trans)); CUDA_CHECK(cudaFree(data_trans_ref)); @@ -76,39 +79,33 @@ class TransposeTest : public ::testing::TestWithParam> { cudaStream_t stream; }; -const std::vector> inputsf2 = { - {0.1f, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}}; -const std::vector> inputsd2 = { - {0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; typedef TransposeTest TransposeTestValF; -TEST_P(TransposeTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data_trans, params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data_trans, params.len, raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data, params.len, raft::CompareApproxAbs(params.tolerance))); } typedef TransposeTest TransposeTestValD; -TEST_P(TransposeTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data_trans, params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data_trans, params.len, raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data, params.len, raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 666ab8619d..6349a1907a 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -28,28 +28,25 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ if (in == nullptr) { auto op = [scalar] __device__(OutType * ptr, IdxType idx) { *ptr = static_cast(scalar * idx); }; writeOnlyUnaryOp(out, len, op, stream); } else { - auto op = [scalar] __device__(InType in) { - return static_cast(in * scalar); - }; + auto op = [scalar] __device__(InType in) { return static_cast(in * scalar); }; unaryOp(out, in, len, op, stream); } } template -class UnaryOpTest - : public ::testing::TestWithParam> { +class UnaryOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - UnaryOpInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); CUDA_CHECK(cudaStreamCreate(&stream)); auto len = params.len; @@ -59,7 +56,8 @@ class UnaryOpTest r.uniform(in, len, InType(-1.0), InType(1.0), stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(in)); @@ -67,18 +65,18 @@ class UnaryOpTest CUDA_CHECK(cudaFree(out)); } - virtual void DoTest() { - auto len = params.len; + virtual void DoTest() + { + auto len = params.len; auto scalar = params.scalar; naiveScale(out_ref, in, scalar, len, stream); unaryOpLaunch(out, in, scalar, len, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } UnaryOpInputs params; - InType *in; + InType* in; OutType *out_ref, *out; cudaStream_t stream; }; @@ -86,14 +84,15 @@ class UnaryOpTest template class WriteOnlyUnaryOpTest : public UnaryOpTest { protected: - void DoTest() override { - auto len = this->params.len; + void DoTest() override + { + auto len = this->params.len; auto scalar = this->params.scalar; - naiveScale(this->out_ref, (OutType *)nullptr, scalar, len, this->stream); - unaryOpLaunch(this->out, (OutType *)nullptr, scalar, len, this->stream); + naiveScale(this->out_ref, (OutType*)nullptr, scalar, len, this->stream); + unaryOpLaunch(this->out, (OutType*)nullptr, scalar, len, this->stream); CUDA_CHECK(cudaStreamSynchronize(this->stream)); - ASSERT_TRUE(devArrMatch(this->out_ref, this->out, this->params.len, - CompareApprox(this->params.tolerance))); + ASSERT_TRUE(devArrMatch( + this->out_ref, this->out, this->params.len, CompareApprox(this->params.tolerance))); } }; @@ -101,8 +100,7 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest { TEST_P(Name, Result) { DoTest(); } \ INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs)) -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef UnaryOpTest UnaryOpTestF_i32; UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32); typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i32; diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index be3f1124c5..3343389af8 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -24,8 +24,8 @@ namespace raft { namespace linalg { template -__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, - IdxType len) { +__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { if (in == nullptr) { @@ -38,12 +38,11 @@ __global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, } template -void naiveScale(OutType *out, const InType *in, InType scalar, int len, - cudaStream_t stream) { +void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveScaleKernel - <<>>(out, in, scalar, len); + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -56,8 +55,8 @@ struct UnaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const UnaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs& d) +{ return os; } diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 578139623a..9cdd36b252 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -24,53 +24,51 @@ namespace raft { namespace matrix { template -__global__ void nativePowerKernel(Type *in, Type *out, int len) { +__global__ void nativePowerKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] * in[idx]; - } + if (idx < len) { out[idx] = in[idx] * in[idx]; } } template -void naivePower(Type *in, Type *out, int len, cudaStream_t stream) { +void naivePower(Type* in, Type* out, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativePowerKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void nativeSqrtKernel(Type *in, Type *out, int len) { +__global__ void nativeSqrtKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = sqrt(in[idx]); - } + if (idx < len) { out[idx] = sqrt(in[idx]); } } template -void naiveSqrt(Type *in, Type *out, int len) { +void naiveSqrt(Type* in, Type* out, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativeSqrtKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, - int colCount) { +__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount) +{ int d_i = blockIdx.x * rowCount; int end = d_i + rowCount; if (blockIdx.x < colCount) { - Type max = 0.0; + Type max = 0.0; int max_index = 0; for (int i = d_i; i < end; i++) { Type val = in[i]; - if (val < 0.0) { - val = -val; - } + if (val < 0.0) { val = -val; } if (val > max) { - max = val; + max = val; max_index = i; } } @@ -88,7 +86,8 @@ __global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, } template -void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) { +void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount) +{ naiveSignFlipKernel<<>>(in, out, rowCount, colCount); CUDA_CHECK(cudaPeekAtLastError()); } @@ -103,14 +102,16 @@ struct MathInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MathInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MathInputs& dims) +{ return os; } template class MathTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); random::Rng r(params.seed); int len = params.len; @@ -154,7 +155,7 @@ class MathTest : public ::testing::TestWithParam> { allocate(in_recip_ref, 4); allocate(out_recip, 4); // default threshold is 1e-15 - std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; + std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; update_device(in_recip, in_recip_h.data(), 4, stream); update_device(in_recip_ref, in_recip_ref_h.data(), 4, stream); @@ -165,7 +166,7 @@ class MathTest : public ::testing::TestWithParam> { reciprocal(in_recip, recip_scalar, 4, stream, true); - std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; + std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; allocate(in_smallzero, 4); allocate(out_smallzero, 4); @@ -177,7 +178,8 @@ class MathTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in_power)); CUDA_CHECK(cudaFree(out_power_ref)); CUDA_CHECK(cudaFree(in_sqrt)); @@ -196,137 +198,129 @@ class MathTest : public ::testing::TestWithParam> { protected: MathInputs params; - T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, - *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, - *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; + T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, + *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, + *out_smallzero_ref; }; -const std::vector> inputsf = { - {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd = { - {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; typedef MathTest MathPowerTestF; -TEST_P(MathPowerTestF, Result) { - ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_power, out_power_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathPowerTestD; -TEST_P(MathPowerTestD, Result) { - ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_power, out_power_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestF; -TEST_P(MathSqrtTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestD; -TEST_P(MathSqrtTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestF; -TEST_P(MathRatioTestF, Result) { - ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestF, Result) +{ + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestD; -TEST_P(MathRatioTestD, Result) { - ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestD, Result) +{ + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestF; -TEST_P(MathSignFlipTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sign_flip, out_sign_flip_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestD; -TEST_P(MathSignFlipTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sign_flip, out_sign_flip_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestF; -TEST_P(MathReciprocalTestF, Result) { - ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestF, Result) +{ + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestD; -TEST_P(MathReciprocalTestD, Result) { - ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestD, Result) +{ + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestF; -TEST_P(MathSetSmallZeroTestF, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestD; -TEST_P(MathSetSmallZeroTestD, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd)); } // namespace matrix } // namespace raft diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 28222c0697..fc5a418bda 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -32,14 +32,16 @@ struct MatrixInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MatrixInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatrixInputs& dims) +{ return os; } template class MatrixTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; @@ -54,13 +56,14 @@ class MatrixTest : public ::testing::TestWithParam> { // copy(in1, in1_revr, params.n_row, params.n_col); // colReverse(in1_revr, params.n_row, params.n_col); - T *outTrunc; + T* outTrunc; raft::allocate(outTrunc, 6); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); // CUDA_CHECK(cudaFree(in1_revr)); @@ -73,31 +76,30 @@ class MatrixTest : public ::testing::TestWithParam> { const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 4, 4, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 4, 4, 1234ULL}}; typedef MatrixTest MatrixTestF; -TEST_P(MatrixTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col, - raft::CompareApprox(params.tolerance))); +TEST_P(MatrixTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + in1, in2, params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } typedef MatrixTest MatrixTestD; -TEST_P(MatrixTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col, - raft::CompareApprox(params.tolerance))); +TEST_P(MatrixTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + in1, in2, params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2)); template class MatrixCopyRowsTest : public ::testing::Test { - using math_t = typename std::tuple_element<0, T>::type; - using idx_t = typename std::tuple_element<1, T>::type; + using math_t = typename std::tuple_element<0, T>::type; + using idx_t = typename std::tuple_element<1, T>::type; using idx_array_t = typename std::tuple_element<2, T>::type; protected: @@ -105,42 +107,38 @@ class MatrixCopyRowsTest : public ::testing::Test { : allocator(handle.get_device_allocator()), input(allocator, handle.get_stream(), n_cols * n_rows), indices(allocator, handle.get_stream(), n_selected), - output(allocator, handle.get_stream(), n_cols * n_selected) { + output(allocator, handle.get_stream(), n_cols * n_selected) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, - ptr); + thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, ptr); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testCopyRows() { - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, false); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(), - n_selected * n_cols, - raft::Compare())); - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, true); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(), - n_selected * n_cols, - raft::Compare())); + void testCopyRows() + { + copyRows( + input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare())); + copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare())); } protected: - int n_rows = 10; - int n_cols = 3; + int n_rows = 10; + int n_cols = 3; int n_selected = 5; - idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; - math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, - 17, 19, 20, 23, 24, 27, 29}; - math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, - 14, 21, 22, 23, 27, 28, 29}; + idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; + math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29}; + math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29}; raft::handle_t handle; cudaStream_t stream; std::shared_ptr allocator; @@ -149,10 +147,10 @@ class MatrixCopyRowsTest : public ::testing::Test { raft::mr::device::buffer indices; }; -using TypeTuple = - ::testing::Types, std::tuple, - std::tuple, - std::tuple>; +using TypeTuple = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple); TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); } diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 223efdbfe8..9ba2c3332b 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -25,7 +25,8 @@ namespace raft { namespace mr { namespace device { -TEST(Raft, DeviceBufferAlloc) { +TEST(Raft, DeviceBufferAlloc) +{ auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -52,13 +53,14 @@ TEST(Raft, DeviceBufferAlloc) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceBufferZeroResize) { +TEST(Raft, DeviceBufferZeroResize) +{ // Create a limiting_resource_adaptor to track allocations - auto curr_mr = dynamic_cast( - rmm::mr::get_current_device_resource()); - auto limit_mr = std::make_shared< - rmm::mr::limiting_resource_adaptor>(curr_mr, - 1000); + auto curr_mr = + dynamic_cast(rmm::mr::get_current_device_resource()); + auto limit_mr = + std::make_shared>(curr_mr, + 1000); rmm::mr::set_current_device_resource(limit_mr.get()); diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp index 953f65ddfb..aadf05285c 100644 --- a/cpp/test/mr/host/buffer.cpp +++ b/cpp/test/mr/host/buffer.cpp @@ -24,7 +24,8 @@ namespace raft { namespace mr { namespace host { -TEST(Raft, HostBuffer) { +TEST(Raft, HostBuffer) +{ auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -51,14 +52,14 @@ TEST(Raft, HostBuffer) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceToHostBuffer) { +TEST(Raft, DeviceToHostBuffer) +{ auto d_alloc = std::make_shared(); auto h_alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); device::buffer d_buff(d_alloc, stream, 32); - CUDA_CHECK( - cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); buffer h_buff(h_alloc, d_buff); ASSERT_EQ(d_buff.size(), h_buff.size()); CUDA_CHECK(cudaStreamSynchronize(stream)); diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index d7aa76500b..5560c61c73 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -54,7 +54,8 @@ namespace mst { // Sequential prims function // Returns total weight of MST template -weight_t prims(CSRHost &csr_h) { +weight_t prims(CSRHost& csr_h) +{ auto n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; @@ -63,19 +64,18 @@ weight_t prims(CSRHost &csr_h) { for (auto i = 0; i < n_vertices; i++) { active_vertex[i] = false; - curr_edge[i] = INT_MAX; + curr_edge[i] = INT_MAX; } curr_edge[0] = 0; // function to pick next min vertex-edge - auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex, - auto n_vertices) { + auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) { weight_t min = INT_MAX; vertex_t min_vertex; for (auto v = 0; v < n_vertices; v++) { if (!active_vertex[v] && curr_edge[v] < min) { - min = curr_edge[v]; + min = curr_edge[v]; min_vertex = v; } } @@ -91,14 +91,13 @@ weight_t prims(CSRHost &csr_h) { active_vertex[curr_v] = true; // set to active // iterate through edges of current active vertex - auto edge_st = csr_h.offsets[curr_v]; + auto edge_st = csr_h.offsets[curr_v]; auto edge_end = csr_h.offsets[curr_v + 1]; for (auto e = edge_st; e < edge_end; e++) { // put edges to be considered for next iteration auto neighbor_idx = csr_h.indices[e]; - if (!active_vertex[neighbor_idx] && - csr_h.weights[e] < curr_edge[neighbor_idx]) { + if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) { curr_edge[neighbor_idx] = csr_h.weights[e]; } } @@ -114,99 +113,101 @@ weight_t prims(CSRHost &csr_h) { } template -class MSTTest - : public ::testing::TestWithParam> { +class MSTTest : public ::testing::TestWithParam> { protected: std::pair, raft::Graph_COO> - mst_gpu() { - edge_t *offsets = static_cast(csr_d.offsets.data()); - vertex_t *indices = static_cast(csr_d.indices.data()); - weight_t *weights = static_cast(csr_d.weights.data()); + mst_gpu() + { + edge_t* offsets = static_cast(csr_d.offsets.data()); + vertex_t* indices = static_cast(csr_d.indices.data()); + weight_t* weights = static_cast(csr_d.weights.data()); v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, - std::numeric_limits::max()); + rmm::device_vector mst_src(2 * v - 2, std::numeric_limits::max()); + rmm::device_vector mst_dst(2 * v - 2, std::numeric_limits::max()); rmm::device_vector color(v, 0); - vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); + vertex_t* color_ptr = thrust::raw_pointer_cast(color.data()); if (iterations == 0) { MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0); auto symmetric_result = symmetric_solver.solve(); MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } else { - MST_solver intermediate_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, iterations); + MST_solver intermediate_solver(handle, + offsets, + indices, + weights, + v, + e, + color_ptr, + handle.get_stream(), + true, + true, + iterations); auto intermediate_result = intermediate_solver.solve(); MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, false, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0); auto symmetric_result = symmetric_solver.solve(); // symmetric_result.n_edges += intermediate_result.n_edges; - auto total_edge_size = - symmetric_result.n_edges + intermediate_result.n_edges; + auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges; symmetric_result.src.resize(total_edge_size, handle.get_stream()); symmetric_result.dst.resize(total_edge_size, handle.get_stream()); symmetric_result.weights.resize(total_edge_size, handle.get_stream()); raft::copy(symmetric_result.src.data() + symmetric_result.n_edges, - intermediate_result.src.data(), intermediate_result.n_edges, + intermediate_result.src.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges, - intermediate_result.dst.data(), intermediate_result.n_edges, + intermediate_result.dst.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges, intermediate_result.weights.data(), - intermediate_result.n_edges, handle.get_stream()); + intermediate_result.n_edges, + handle.get_stream()); symmetric_result.n_edges = total_edge_size; MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } } - void SetUp() override { - mst_input = ::testing::TestWithParam< - MSTTestInput>::GetParam(); + void SetUp() override + { + mst_input = ::testing::TestWithParam>::GetParam(); iterations = mst_input.iterations; - csr_d.offsets = rmm::device_buffer( - mst_input.csr_h.offsets.data(), - mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream()); - csr_d.indices = rmm::device_buffer( - mst_input.csr_h.indices.data(), - mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream()); - csr_d.weights = rmm::device_buffer( - mst_input.csr_h.weights.data(), - mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream()); + csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(), + mst_input.csr_h.offsets.size() * sizeof(edge_t), + handle.get_stream()); + csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(), + mst_input.csr_h.indices.size() * sizeof(vertex_t), + handle.get_stream()); + csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(), + mst_input.csr_h.weights.size() * sizeof(weight_t), + handle.get_stream()); } void TearDown() override {} @@ -259,41 +260,68 @@ const std::vector> csr_in_h = { const std::vector> csr_in4_h = { {{0, 3, 5, 8, 10, 12, 14, 16}, {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; // singletons const std::vector> csr_in5_h = { {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16}, {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; typedef MSTTest MSTTestSequential; -TEST_P(MSTTestSequential, Sequential) { - auto results_pair = mst_gpu(); - auto &symmetric_result = results_pair.first; - auto &non_symmetric_result = results_pair.second; +TEST_P(MSTTestSequential, Sequential) +{ + auto results_pair = mst_gpu(); + auto& symmetric_result = results_pair.first; + auto& non_symmetric_result = results_pair.second; // do assertions here // in this case, running sequential MST auto prims_result = prims(mst_input.csr_h); - auto symmetric_sum = - thrust::reduce(thrust::device, symmetric_result.weights.data(), - symmetric_result.weights.data() + symmetric_result.n_edges); - auto non_symmetric_sum = thrust::reduce( - thrust::device, non_symmetric_result.weights.data(), - non_symmetric_result.weights.data() + non_symmetric_result.n_edges); - - ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, - raft::CompareApprox(0.1))); - ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, - raft::CompareApprox(0.1))); + auto symmetric_sum = thrust::reduce(thrust::device, + symmetric_result.weights.data(), + symmetric_result.weights.data() + symmetric_result.n_edges); + auto non_symmetric_sum = + thrust::reduce(thrust::device, + non_symmetric_result.weights.data(), + non_symmetric_result.weights.data() + non_symmetric_result.n_edges); + + ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox(0.1))); + ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox(0.1))); } -INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, - ::testing::ValuesIn(csr_in_h)); +INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h)); } // namespace mst } // namespace raft diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index af10dcab30..25c8fe5084 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -38,12 +38,13 @@ enum RandomType { }; template -__global__ void meanKernel(T* out, const T* data, int len) { +__global__ void meanKernel(T* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int tid = threadIdx.x + blockIdx.x * blockDim.x; - T val = tid < len ? data[tid] : T(0); - T x = BlockReduce(temp_storage).Sum(val); + T val = tid < len ? data[tid] : T(0); + T x = BlockReduce(temp_storage).Sum(val); __syncthreads(); T xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -70,7 +71,8 @@ struct RngInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } @@ -80,46 +82,30 @@ template template class RngTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); allocate(data, params.len); allocate(stats, 2, true); switch (params.type) { - case RNG_Normal: - r.normal(data, params.len, params.start, params.end, stream); - break; - case RNG_LogNormal: - r.lognormal(data, params.len, params.start, params.end, stream); - break; - case RNG_Uniform: - r.uniform(data, params.len, params.start, params.end, stream); - break; - case RNG_Gumbel: - r.gumbel(data, params.len, params.start, params.end, stream); - break; - case RNG_Logistic: - r.logistic(data, params.len, params.start, params.end, stream); - break; - case RNG_Exp: - r.exponential(data, params.len, params.start, stream); - break; - case RNG_Rayleigh: - r.rayleigh(data, params.len, params.start, stream); - break; - case RNG_Laplace: - r.laplace(data, params.len, params.start, params.end, stream); - break; + case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); break; + case RNG_LogNormal: r.lognormal(data, params.len, params.start, params.end, stream); break; + case RNG_Uniform: r.uniform(data, params.len, params.start, params.end, stream); break; + case RNG_Gumbel: r.gumbel(data, params.len, params.start, params.end, stream); break; + case RNG_Logistic: r.logistic(data, params.len, params.start, params.end, stream); break; + case RNG_Exp: r.exponential(data, params.len, params.start, stream); break; + case RNG_Rayleigh: r.rayleigh(data, params.len, params.start, stream); break; + case RNG_Laplace: r.laplace(data, params.len, params.start, params.end, stream); break; }; static const int threads = 128; meanKernel - <<>>(stats, data, - params.len); + <<>>(stats, data, params.len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -127,23 +113,24 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { switch (params.type) { case RNG_Normal: meanvar[0] = params.start; meanvar[1] = params.end * params.end; break; case RNG_LogNormal: { - auto var = params.end * params.end; - auto mu = params.start; + auto var = params.end * params.end; + auto mu = params.start; meanvar[0] = raft::myExp(mu + var * T(0.5)); - meanvar[1] = - (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); break; } case RNG_Uniform: @@ -167,8 +154,7 @@ class RngTest : public ::testing::TestWithParam> { break; case RNG_Rayleigh: meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); - meanvar[1] = - ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; + meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; break; case RNG_Laplace: meanvar[0] = params.start; @@ -259,13 +245,12 @@ const std::vector> inputsf = { {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestF, Result) { +TEST_P(RngTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf)); @@ -321,13 +306,12 @@ const std::vector> inputsd = { {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL}, {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestD, Result) { +TEST_P(RngTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); @@ -335,7 +319,8 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); // Test for expected variance in mean calculations template -T quick_mean(const std::vector& d) { +T quick_mean(const std::vector& d) +{ T acc = T(0); for (const auto& di : d) { acc += di; @@ -344,8 +329,9 @@ T quick_mean(const std::vector& d) { } template -T quick_std(const std::vector& d) { - T acc = T(0); +T quick_std(const std::vector& d) +{ + T acc = T(0); T d_mean = quick_mean(d); for (const auto& di : d) { acc += ((di - d_mean) * (di - d_mean)); @@ -354,7 +340,8 @@ T quick_std(const std::vector& d) { } template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { +std::ostream& operator<<(std::ostream& out, const std::vector& v) +{ if (!v.empty()) { out << '['; std::copy(v.begin(), v.end(), std::ostream_iterator(out, ", ")); @@ -369,11 +356,12 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { // experiments computing the mean, giving us a distribution of the mean // itself. The mean error is simply the standard deviation of this // distribution (the standard deviation of the mean). -TEST(Rng, MeanError) { +TEST(Rng, MeanError) +{ timeb time_struct; ftime(&time_struct); - int seed = time_struct.millitm; - int num_samples = 1024; + int seed = time_struct.millitm; + int num_samples = 1024; int num_experiments = 1024; float* data; float* mean_result; @@ -391,10 +379,9 @@ TEST(Rng, MeanError) { Rng r(seed, rtype); r.normal(data, len, 3.3f, 0.23f, stream); // r.uniform(data, len, -1.0, 2.0); - raft::stats::mean(mean_result, data, num_samples, num_experiments, false, - false, stream); - raft::stats::stddev(std_result, data, mean_result, num_samples, - num_experiments, false, false, stream); + raft::stats::mean(mean_result, data, num_samples, num_experiments, false, false, stream); + raft::stats::stddev( + std_result, data, mean_result, num_samples, num_experiments, false, false, stream); std::vector h_mean_result(num_experiments); std::vector h_std_result(num_experiments); update_host(h_mean_result.data(), mean_result, num_experiments, stream); @@ -403,8 +390,8 @@ TEST(Rng, MeanError) { auto d_mean = quick_mean(h_mean_result); // std-dev of mean; also known as mean error - auto d_std_of_mean = quick_std(h_mean_result); - auto d_std = quick_mean(h_std_result); + auto d_std_of_mean = quick_std(h_mean_result); + auto d_std = quick_mean(h_std_result); auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples); // std::cout << "measured mean error: " << d_std_of_mean << "\n"; @@ -413,8 +400,7 @@ TEST(Rng, MeanError) { auto diff_expected_vs_measured_mean_error = std::abs(d_std_of_mean - d_std / std::sqrt(num_samples)); - ASSERT_TRUE( - (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); + ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(data)); @@ -427,7 +413,8 @@ TEST(Rng, MeanError) { template class ScaledBernoulliTest : public ::testing::Test { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); @@ -438,12 +425,12 @@ class ScaledBernoulliTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaFree(data)); } - void rangeCheck() { + void rangeCheck() + { T* h_data = new T[len]; update_host(h_data, data, len, stream); - ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { - return a < -scale || a > scale; - })); + ASSERT_TRUE( + std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; })); delete[] h_data; } @@ -460,7 +447,8 @@ TEST_F(ScaledBernoulliTest2, RangeCheck) { rangeCheck(); } template class BernoulliTest : public ::testing::Test { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); allocate(data, len * sizeof(bool), stream); @@ -469,7 +457,8 @@ class BernoulliTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaFree(data)); } - void trueFalseCheck() { + void trueFalseCheck() + { // both true and false values must be present bool* h_data = new bool[len]; update_host(h_data, data, len, stream); @@ -499,21 +488,21 @@ struct RngNormalTableInputs { }; template -::std::ostream& operator<<(::std::ostream& os, - const RngNormalTableInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs& dims) +{ return os; } template -class RngNormalTableTest - : public ::testing::TestWithParam> { +class RngNormalTableTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - params = ::testing::TestWithParam>::GetParam(); - int len = params.rows * params.cols; + params = ::testing::TestWithParam>::GetParam(); + int len = params.rows * params.cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -523,11 +512,9 @@ class RngNormalTableTest allocate(mu_vec, params.cols); r.fill(mu_vec, params.cols, params.mu, stream); T* sigma_vec = nullptr; - r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, - params.sigma, stream); + r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, params.sigma, stream); static const int threads = 128; - meanKernel - <<>>(stats, data, len); + meanKernel<<>>(stats, data, len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; @@ -535,13 +522,15 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); CUDA_CHECK(cudaFree(mu_vec)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { meanvar[0] = params.mu; meanvar[1] = params.sigma * params.sigma; } @@ -562,16 +551,14 @@ const std::vector> inputsf_t = { {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestF, Result) { +TEST_P(RngNormalTableTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, - ::testing::ValuesIn(inputsf_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t)); typedef RngNormalTableTest RngNormalTableTestD; const std::vector> inputsd_t = { @@ -581,16 +568,14 @@ const std::vector> inputsd_t = { {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL}, {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestD, Result) { +TEST_P(RngNormalTableTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, - ::testing::ValuesIn(inputsd_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t)); struct RngAffineInputs { int n; @@ -599,13 +584,15 @@ struct RngAffineInputs { class RngAffineTest : public ::testing::TestWithParam { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam::GetParam(); Rng r(params.seed); r.affine_transform_params(params.n, a, b); } - void check() { + void check() + { ASSERT_TRUE(gcd(a, params.n) == 1); ASSERT_TRUE(0 <= b && b < params.n); } @@ -616,13 +603,17 @@ class RngAffineTest : public ::testing::TestWithParam { }; // RngAffineTest const std::vector inputs_affine = { - {100, 123456ULL}, {100, 1234567890ULL}, {101, 123456ULL}, - {101, 1234567890ULL}, {7, 123456ULL}, {7, 1234567890ULL}, - {2568, 123456ULL}, {2568, 1234567890ULL}, + {100, 123456ULL}, + {100, 1234567890ULL}, + {101, 123456ULL}, + {101, 1234567890ULL}, + {7, 123456ULL}, + {7, 1234567890ULL}, + {2568, 123456ULL}, + {2568, 1234567890ULL}, }; TEST_P(RngAffineTest, Result) { check(); } -INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, - ::testing::ValuesIn(inputs_affine)); +INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine)); } // namespace random } // namespace raft diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index 92f12206e8..c77c3df526 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -27,12 +27,13 @@ namespace random { enum RandomType { RNG_Uniform }; template -__global__ void meanKernel(float *out, const T *data, int len) { +__global__ void meanKernel(float* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; float val = tid < len ? data[tid] : T(0); - float x = BlockReduce(temp_storage).Sum(val); + float x = BlockReduce(temp_storage).Sum(val); __syncthreads(); float xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -59,14 +60,16 @@ struct RngInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const RngInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } template class RngTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); @@ -75,14 +78,11 @@ class RngTest : public ::testing::TestWithParam> { allocate(data, params.len); allocate(stats, 2, true); switch (params.type) { - case RNG_Uniform: - r.uniformInt(data, params.len, params.start, params.end, stream); - break; + case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); break; }; static const int threads = 128; meanKernel - <<>>(stats, data, - params.len); + <<>>(stats, data, params.len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -90,12 +90,14 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); } - void getExpectedMeanVar(float meanvar[2]) { + void getExpectedMeanVar(float meanvar[2]) + { switch (params.type) { case RNG_Uniform: meanvar[0] = (params.start + params.end) * 0.5f; @@ -107,8 +109,8 @@ class RngTest : public ::testing::TestWithParam> { protected: RngInputs params; - T *data; - float *stats; + T* data; + float* stats; float h_stats[2]; // mean, var }; @@ -120,13 +122,12 @@ const std::vector> inputs_u32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU32, Result) { +TEST_P(RngTestU32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32)); @@ -138,13 +139,12 @@ const std::vector> inputs_u64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU64, Result) { +TEST_P(RngTestU64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64)); @@ -156,13 +156,12 @@ const std::vector> inputs_s32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS32, Result) { +TEST_P(RngTestS32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32)); @@ -174,13 +173,12 @@ const std::vector> inputs_s64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS64, Result) { +TEST_P(RngTestS64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64)); diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index d7e52a8958..c258841c3e 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -38,14 +38,16 @@ struct SWoRInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) +{ return os; } template class SWoRTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); CUDA_CHECK(cudaStreamCreate(&stream)); @@ -58,15 +60,14 @@ class SWoRTest : public ::testing::TestWithParam> { r.uniform(in, params.len, T(-1.0), T(1.0), stream); r.uniform(wts, params.len, T(1.0), T(2.0), stream); if (params.largeWeightIndex >= 0) { - update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, - stream); + update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, stream); } - r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, - params.len, stream); + r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, params.len, stream); update_host(&(h_outIdx[0]), outIdx, params.sampledLen, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(in)); @@ -147,14 +148,14 @@ const std::vector> inputsf = { {1024, 512, 10, 100000.f, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestF, Result) { +TEST_P(SWoRTestF, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -162,9 +163,7 @@ TEST_P(SWoRTestF, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf)); @@ -231,14 +230,14 @@ const std::vector> inputsd = { {1024, 512, 10, 100000.0, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestD, Result) { +TEST_P(SWoRTestD, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -246,9 +245,7 @@ TEST_P(SWoRTestD, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 713708d4cd..e1f814a5b6 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -44,14 +44,14 @@ struct CSRAddInputs { }; template -class CSRAddTest - : public ::testing::TestWithParam> { +class CSRAddTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); - n_rows = params.matrix_a.row_ind.size(); - nnz_a = params.matrix_a.row_ind_ptr.size(); - nnz_b = params.matrix_b.row_ind_ptr.size(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + n_rows = params.matrix_a.row_ind.size(); + nnz_a = params.matrix_a.row_ind_ptr.size(); + nnz_b = params.matrix_b.row_ind_ptr.size(); nnz_result = params.matrix_verify.row_ind_ptr.size(); cudaStreamCreate(&stream); @@ -73,46 +73,61 @@ class CSRAddTest raft::allocate(values_result, nnz_result); } - void Run() { - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void Run() + { + std::shared_ptr alloc(new raft::mr::device::default_allocator); raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); - raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, - stream); + raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); raft::update_device(values_a, params.matrix_a.values.data(), nnz_a, stream); raft::update_device(ind_b, params.matrix_b.row_ind.data(), n_rows, stream); - raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, - stream); + raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, stream); raft::update_device(values_b, params.matrix_b.values.data(), nnz_b, stream); - raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, - stream); - raft::update_device(ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), - nnz_result, stream); - raft::update_device(values_verify, params.matrix_verify.values.data(), - nnz_result, stream); - - Index_ nnz = linalg::csr_add_calc_inds( - ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, alloc, stream); + raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, stream); + raft::update_device( + ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), nnz_result, stream); + raft::update_device(values_verify, params.matrix_verify.values.data(), nnz_result, stream); + + Index_ nnz = linalg::csr_add_calc_inds(ind_a, + ind_ptr_a, + values_a, + nnz_a, + ind_b, + ind_ptr_b, + values_b, + nnz_b, + n_rows, + ind_result, + alloc, + stream); ASSERT_TRUE(nnz == nnz_result); - ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, - raft::Compare())); - - linalg::csr_add_finalize( - ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, ind_ptr_result, values_result, stream); - - ASSERT_TRUE(raft::devArrMatch(ind_ptr_verify, ind_ptr_result, nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(values_verify, values_result, nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, raft::Compare())); + + linalg::csr_add_finalize(ind_a, + ind_ptr_a, + values_a, + nnz_a, + ind_b, + ind_ptr_b, + values_b, + nnz_b, + n_rows, + ind_result, + ind_ptr_result, + values_result, + stream); + + ASSERT_TRUE( + raft::devArrMatch(ind_ptr_verify, ind_ptr_result, nnz, raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(values_verify, values_result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ind_a)); CUDA_CHECK(cudaFree(ind_b)); CUDA_CHECK(cudaFree(ind_result)); @@ -131,8 +146,8 @@ class CSRAddTest CSRAddInputs params; cudaStream_t stream; Index_ n_rows, nnz_a, nnz_b, nnz_result; - Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, - *ind_ptr_verify, *ind_ptr_result; + Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, *ind_ptr_verify, + *ind_ptr_result; Type_f *values_a, *values_b, *values_verify, *values_result; }; @@ -165,10 +180,8 @@ const std::vector> csradd_inputs_d = { {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}}, }; -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, - ::testing::ValuesIn(csradd_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, - ::testing::ValuesIn(csradd_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index d98f9de9c3..3678d34bbe 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -51,26 +51,24 @@ struct ConnectComponentsInputs { }; template -class ConnectComponentsTest : public ::testing::TestWithParam< - ConnectComponentsInputs> { +class ConnectComponentsTest + : public ::testing::TestWithParam> { protected: - void basicTest() { + void basicTest() + { raft::handle_t handle; auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); - params = ::testing::TestWithParam< - ConnectComponentsInputs>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); - raft::sparse::COO out_edges( - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::COO out_edges(handle.get_device_allocator(), + handle.get_stream()); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); rmm::device_uvector indptr(params.n_row + 1, stream); @@ -79,44 +77,58 @@ class ConnectComponentsTest : public ::testing::TestWithParam< */ raft::sparse::COO knn_graph_coo(d_alloc, stream); - raft::sparse::selection::knn_graph( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c); + raft::sparse::selection::knn_graph(handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + knn_graph_coo, + params.c); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, d_alloc, stream); /** * 2. Construct MST, sorted by weights */ rmm::device_uvector colors(params.n_row, stream); - auto mst_coo = raft::mst::mst( - handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(), - params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true); + auto mst_coo = raft::mst::mst(handle, + indptr.data(), + knn_graph_coo.cols(), + knn_graph_coo.vals(), + params.n_row, + knn_graph_coo.nnz, + colors.data(), + stream, + false, + true); /** * 3. connect_components to fix connectivities */ - raft::linkage::FixConnectivitiesRedOp red_op( - colors.data(), params.n_row); + raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); raft::linkage::connect_components( - handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, - red_op); + handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); /** * Construct final edge list */ rmm::device_uvector indptr2(params.n_row + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, - indptr2.data(), params.n_row + 1, - d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, d_alloc, stream); - auto output_mst = raft::mst::mst( - handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, - out_edges.nnz, colors.data(), stream, false, false); + auto output_mst = raft::mst::mst(handle, + indptr2.data(), + out_edges.cols(), + out_edges.vals(), + params.n_row, + out_edges.nnz, + colors.data(), + stream, + false, + false); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -138,366 +150,199 @@ const std::vector> fix_conn_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, -1}, // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, -4}}; typedef ConnectComponentsTest ConnectComponentsTestF_Int; -TEST_P(ConnectComponentsTestF_Int, Result) { +TEST_P(ConnectComponentsTestF_Int, Result) +{ /** - * Verify the src & dst vertices on each edge have different colors - */ + * Verify the src & dst vertices on each edge have different colors + */ EXPECT_TRUE(final_edges == params.n_row - 1); } -INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int, +INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, + ConnectComponentsTestF_Int, ::testing::ValuesIn(fix_conn_inputsf2)); }; // namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index ea69ecfc53..2e4c2c1a14 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -39,7 +39,8 @@ struct CSRtoCOOInputs { template class CSRtoCOOTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); @@ -48,20 +49,21 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { raft::allocate(result, params.verify.size(), true); } - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.verify.size(); + Index_ nnz = params.verify.size(); raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(verify, params.verify.data(), nnz, stream); convert::csr_to_coo(ex_scan, n_rows, result, nnz, stream); - ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, - raft::Compare(), stream)); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare(), stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(verify)); CUDA_CHECK(cudaFree(result)); @@ -89,9 +91,11 @@ const std::vector> csrtocoo_inputs_64 = { {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestI, ::testing::ValuesIn(csrtocoo_inputs_32)); -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestL, ::testing::ValuesIn(csrtocoo_inputs_64)); } // namespace sparse diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 553ef2ddee..b2878081ae 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -37,14 +37,13 @@ struct SparseConvertCSRInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseConvertCSRInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs& dims) +{ return os; } template -class SparseConvertCSRTest - : public ::testing::TestWithParam> { +class SparseConvertCSRTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -54,22 +53,21 @@ class SparseConvertCSRTest SparseConvertCSRInputs params; }; -const std::vector> inputsf = { - {5, 10, 5, 1234ULL}}; +const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseConvertCSRTest SortedCOOToCSR; -TEST_P(SortedCOOToCSR, Result) { +TEST_P(SortedCOOToCSR, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + std::shared_ptr alloc(new raft::mr::device::default_allocator); int nnz = 8; int *in, *out, *exp; - int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *exp_h = new int[4]{0, 2, 4, 6}; + int* in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* exp_h = new int[4]{0, 2, 4, 6}; raft::allocate(in, nnz, true); raft::allocate(exp, 4, true); @@ -92,8 +90,7 @@ TEST_P(SortedCOOToCSR, Result) { CUDA_CHECK(cudaFree(out)); } -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf)); /******************************** adj graph ********************************/ @@ -107,10 +104,10 @@ struct CSRAdjGraphInputs { }; template -class CSRAdjGraphTest - : public ::testing::TestWithParam> { +class CSRAdjGraphTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); nnz = params.verify.size(); @@ -121,20 +118,21 @@ class CSRAdjGraphTest raft::allocate(verify, nnz); } - void Run() { + void Run() + { raft::update_device(row_ind, params.row_ind.data(), params.n_rows, stream); - raft::update_device(adj, reinterpret_cast(params.adj.data()), - params.n_rows * params.n_cols, stream); + raft::update_device( + adj, reinterpret_cast(params.adj.data()), params.n_rows * params.n_cols, stream); raft::update_device(verify, params.verify.data(), nnz, stream); convert::csr_adj_graph_batched( row_ind, params.n_cols, nnz, params.n_rows, adj, result, stream); - ASSERT_TRUE( - raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(row_ind)); CUDA_CHECK(cudaFree(adj)); CUDA_CHECK(cudaFree(verify)); @@ -147,7 +145,7 @@ class CSRAdjGraphTest cudaStream_t stream; Index_ nnz; Index_ *row_ind, *result, *verify; - bool *adj; + bool* adj; }; using CSRAdjGraphTestI = CSRAdjGraphTest; @@ -171,9 +169,11 @@ const std::vector> csradjgraph_inputs_l = { {0, 1, 2, 0, 1, 2, 0, 1, 2}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestI, ::testing::ValuesIn(csradjgraph_inputs_i)); -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestL, ::testing::ValuesIn(csradjgraph_inputs_l)); } // namespace sparse diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 625772a842..fe43f0d182 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -47,19 +47,19 @@ struct CSRRowSliceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRRowSliceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs& dims) +{ return os; } template -class CSRRowSliceTest - : public ::testing::TestWithParam> { +class CSRRowSliceTest : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -69,31 +69,27 @@ class CSRRowSliceTest update_device(indices, indices_h.data(), indices_h.size(), stream); update_device(data, data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; + std::vector out_data_ref_h = params.out_data_ref_h; allocate(out_indptr_ref, out_indptr_ref_h.size()); allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_data_ref, out_data_ref_h.size()); - update_device(out_indptr_ref, out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), - stream); + update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); allocate(out_indptr, out_indptr_ref_h.size()); allocate(out_indices, out_indices_ref_h.size()); allocate(out_data, out_data_ref_h.size()); } - void SetUp() override { - params = ::testing::TestWithParam< - CSRRowSliceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); @@ -101,18 +97,22 @@ class CSRRowSliceTest int csr_start_offset; int csr_stop_offset; - raft::sparse::op::csr_row_slice_indptr( - params.start_row, params.stop_row, indptr, out_indptr, &csr_start_offset, - &csr_stop_offset, stream); + raft::sparse::op::csr_row_slice_indptr(params.start_row, + params.stop_row, + indptr, + out_indptr, + &csr_start_offset, + &csr_stop_offset, + stream); - raft::sparse::op::csr_row_slice_populate(csr_start_offset, csr_stop_offset, - indices, data, out_indices, - out_data, stream); + raft::sparse::op::csr_row_slice_populate( + csr_start_offset, csr_stop_offset, indices, data, out_indices, out_data, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -125,15 +125,14 @@ class CSRRowSliceTest CUDA_CHECK(cudaFree(out_data_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref, - params.out_indptr_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref, - params.out_indices_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_data, out_data_ref, - params.out_data_ref_h.size(), Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare())); + ASSERT_TRUE( + devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare())); } protected: @@ -141,15 +140,15 @@ class CSRRowSliceTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_idx *out_indptr, *out_indices; - value_t *out_data; + value_t* out_data; // expected output data value_idx *out_indptr_ref, *out_indices_ref; - value_t *out_data_ref; + value_t* out_data_ref; CSRRowSliceInputs params; }; @@ -177,8 +176,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRRowSliceTest CSRRowSliceTestF; TEST_P(CSRRowSliceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 5535df4fe3..286493ada7 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -43,19 +43,19 @@ struct CSRToDenseInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRToDenseInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs& dims) +{ return os; } template -class CSRToDenseTest - : public ::testing::TestWithParam> { +class CSRToDenseTest : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -74,24 +74,24 @@ class CSRToDenseTest allocate(out, out_ref_h.size()); } - void SetUp() override { - params = ::testing::TestWithParam< - CSRToDenseInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - convert::csr_to_dense(handle, params.nrows, params.ncols, indptr, indices, - data, params.nrows, out, stream, true); + convert::csr_to_dense( + handle, params.nrows, params.ncols, indptr, indices, data, params.nrows, out, stream, true); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -100,9 +100,9 @@ class CSRToDenseTest CUDA_CHECK(cudaFree(out_ref)); } - void compare() { - ASSERT_TRUE( - devArrMatch(out, out_ref, params.out_ref_h.size(), Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch(out, out_ref, params.out_ref_h.size(), Compare())); } protected: @@ -111,13 +111,13 @@ class CSRToDenseTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data - value_t *out; + value_t* out; // expected output data - value_t *out_ref; + value_t* out_ref; CSRToDenseInputs params; }; @@ -128,13 +128,26 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 2, 3, 0, 1, 2, 3}, // indices {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, - {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 16.0f, 2.0f}}, + {1.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 5.0f, + 50.0f, + 28.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 16.0f, + 2.0f}}, }; typedef CSRToDenseTest CSRToDenseTestF; TEST_P(CSRToDenseTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index c257d6eb3c..87b8b17073 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -49,19 +49,19 @@ struct CSRTransposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRTransposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs& dims) +{ return os; } template -class CSRTransposeTest - : public ::testing::TestWithParam> { +class CSRTransposeTest : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -71,45 +71,51 @@ class CSRTransposeTest update_device(indices, indices_h.data(), indices_h.size(), stream); update_device(data, data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; + std::vector out_data_ref_h = params.out_data_ref_h; allocate(out_indptr_ref, out_indptr_ref_h.size()); allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_data_ref, out_data_ref_h.size()); - update_device(out_indptr_ref, out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), - stream); + update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); allocate(out_indptr, out_indptr_ref_h.size()); allocate(out_indices, out_indices_ref_h.size()); allocate(out_data, out_data_ref_h.size()); } - void SetUp() override { - params = ::testing::TestWithParam< - CSRTransposeInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - raft::sparse::linalg::csr_transpose( - handle, indptr, indices, data, out_indptr, out_indices, out_data, - params.nrows, params.ncols, params.nnz, alloc, stream); + raft::sparse::linalg::csr_transpose(handle, + indptr, + indices, + data, + out_indptr, + out_indices, + out_data, + params.nrows, + params.ncols, + params.nnz, + alloc, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -122,15 +128,14 @@ class CSRTransposeTest CUDA_CHECK(cudaFree(out_data_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref, - params.out_indptr_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref, - params.out_indices_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_data, out_data_ref, - params.out_data_ref_h.size(), Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare())); + ASSERT_TRUE( + devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare())); } protected: @@ -139,15 +144,15 @@ class CSRTransposeTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_idx *out_indptr, *out_indices; - value_t *out_data; + value_t* out_data; // expected output data value_idx *out_indptr_ref, *out_indices_ref; - value_t *out_data_ref; + value_t* out_data_ref; CSRTransposeInputs params; }; @@ -167,8 +172,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRTransposeTest CSRTransposeTestF; TEST_P(CSRTransposeTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 5d687ad92b..c6b2a27273 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -33,8 +33,7 @@ struct SparseDegreeInputs { }; template -class SparseDegreeTests - : public ::testing::TestWithParam> { +class SparseDegreeTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -47,11 +46,12 @@ class SparseDegreeTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; -TEST_P(COODegree, Result) { +TEST_P(COODegree, Result) +{ int *in_rows, *verify, *results; int in_rows_h[5] = {0, 0, 1, 2, 2}; - int verify_h[5] = {2, 1, 2, 0, 0}; + int verify_h[5] = {2, 1, 2, 0, 0}; raft::allocate(in_rows, 5); raft::allocate(verify, 5, true); @@ -70,16 +70,17 @@ TEST_P(COODegree, Result) { } typedef SparseDegreeTests COODegreeNonzero; -TEST_P(COODegreeNonzero, Result) { +TEST_P(COODegreeNonzero, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int *in_rows, *verify, *results; - float *in_vals; + float* in_vals; - int in_rows_h[5] = {0, 0, 1, 2, 2}; + int in_rows_h[5] = {0, 0, 1, 2, 2}; float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; - int verify_h[5] = {1, 0, 2, 0, 0}; + int verify_h[5] = {1, 0, 2, 0, 0}; raft::allocate(in_rows, 5); raft::allocate(verify, 5, true); @@ -101,10 +102,8 @@ TEST_P(COODegreeNonzero, Result) { CUDA_CHECK(cudaStreamDestroy(stream)); } -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index a83b93f83f..7c0db49a04 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -55,71 +55,82 @@ struct InputConfiguration { }; using dense_smem_strategy_t = dense_smem_strategy; -using hash_strategy_t = hash_strategy; +using hash_strategy_t = hash_strategy; template struct SparseDistanceCOOSPMVInputs { InputConfiguration input_configuration; float capacity_threshold = 0.5; - int map_size = hash_strategy::get_map_size(); + int map_size = hash_strategy::get_map_size(); }; template -::std::ostream &operator<<( - ::std::ostream &os, - const SparseDistanceCOOSPMVInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseDistanceCOOSPMVInputs& dims) +{ return os; } template class SparseDistanceCOOSPMVTest - : public ::testing::TestWithParam< - SparseDistanceCOOSPMVInputs> { + : public ::testing::TestWithParam> { public: SparseDistanceCOOSPMVTest() : dist_config(handle) {} - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config, params.capacity_threshold, params.map_size); } - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config); } template - void compute_dist(reduce_f reduce_func, accum_f accum_func, - write_f write_func, bool rev = true) { - raft::mr::device::buffer coo_rows( - dist_config.handle.get_device_allocator(), - dist_config.handle.get_stream(), - max(dist_config.b_nnz, dist_config.a_nnz)); - - raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, - coo_rows.data(), dist_config.b_nnz, + void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) + { + raft::mr::device::buffer coo_rows(dist_config.handle.get_device_allocator(), + dist_config.handle.get_stream(), + max(dist_config.b_nnz, dist_config.a_nnz)); + + raft::sparse::convert::csr_to_coo(dist_config.b_indptr, + dist_config.b_nrows, + coo_rows.data(), + dist_config.b_nnz, dist_config.handle.get_stream()); strategy_t selected_strategy = make_strategy(); - balanced_coo_pairwise_generalized_spmv( - out_dists, dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + balanced_coo_pairwise_generalized_spmv(out_dists, + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); if (rev) { - raft::sparse::convert::csr_to_coo( - dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(), - dist_config.a_nnz, dist_config.handle.get_stream()); - - balanced_coo_pairwise_generalized_spmv_rev( - out_dists, dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + raft::sparse::convert::csr_to_coo(dist_config.a_indptr, + dist_config.a_nrows, + coo_rows.data(), + dist_config.a_nnz, + dist_config.handle.get_stream()); + + balanced_coo_pairwise_generalized_spmv_rev(out_dists, + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); } } - void run_spmv() { + void run_spmv() + { switch (params.input_configuration.metric) { case raft::distance::DistanceType::InnerProduct: compute_dist(Product(), Sum(), AtomicAdd(), true); @@ -129,75 +140,69 @@ class SparseDistanceCOOSPMVTest break; case raft::distance::DistanceType::Canberra: compute_dist( - [] __device__(value_t a, value_t b) { - return fabsf(a - b) / (fabsf(a) + fabsf(b)); - }, - Sum(), AtomicAdd()); - break; - case raft::distance::DistanceType::L1: - compute_dist(AbsDiff(), Sum(), AtomicAdd()); - break; - case raft::distance::DistanceType::Linf: - compute_dist(AbsDiff(), Max(), AtomicMax()); + [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); }, + Sum(), + AtomicAdd()); break; + case raft::distance::DistanceType::L1: compute_dist(AbsDiff(), Sum(), AtomicAdd()); break; + case raft::distance::DistanceType::Linf: compute_dist(AbsDiff(), Max(), AtomicMax()); break; case raft::distance::DistanceType::LpUnexpanded: { - compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), - AtomicAdd()); + compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), AtomicAdd()); float p = 1.0f / params.input_configuration.metric_arg; raft::linalg::unaryOp( - out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows, + out_dists, + out_dists, + dist_config.a_nrows * dist_config.b_nrows, [=] __device__(value_t input) { return powf(input, p); }, dist_config.handle.get_stream()); } break; - default: - throw raft::exception("Unknown distance"); + default: throw raft::exception("Unknown distance"); } } protected: - void make_data() { - std::vector indptr_h = params.input_configuration.indptr_h; + void make_data() + { + std::vector indptr_h = params.input_configuration.indptr_h; std::vector indices_h = params.input_configuration.indices_h; - std::vector data_h = params.input_configuration.data_h; + std::vector data_h = params.input_configuration.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), - handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), - handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); - std::vector out_dists_ref_h = - params.input_configuration.out_dists_ref_h; + std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), - handle.get_stream()); + update_device( + out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); } - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam< SparseDistanceCOOSPMVInputs>::GetParam(); make_data(); - dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.b_ncols = params.input_configuration.n_cols; - dist_config.b_nnz = params.input_configuration.indices_h.size(); - dist_config.b_indptr = indptr; + dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.b_ncols = params.input_configuration.n_cols; + dist_config.b_nnz = params.input_configuration.indices_h.size(); + dist_config.b_indptr = indptr; dist_config.b_indices = indices; - dist_config.b_data = data; - dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.a_ncols = params.input_configuration.n_cols; - dist_config.a_nnz = params.input_configuration.indices_h.size(); - dist_config.a_indptr = indptr; + dist_config.b_data = data; + dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.a_ncols = params.input_configuration.n_cols; + dist_config.a_nnz = params.input_configuration.indices_h.size(); + dist_config.a_indptr = indptr; dist_config.a_indices = indices; - dist_config.a_data = data; + dist_config.a_data = data; int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -208,7 +213,8 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -217,8 +223,10 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref, + out_dists, params.input_configuration.out_dists_ref_h.size(), CompareApprox(1e-3))); } @@ -228,7 +236,7 @@ class SparseDistanceCOOSPMVTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_t *out_dists, *out_dists_ref; @@ -243,8 +251,7 @@ const InputConfiguration input_inner_product = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}; @@ -275,384 +282,379 @@ const InputConfiguration input_l2_unexpanded = { raft::distance::DistanceType::L2Unexpanded, 0.0}; -const InputConfiguration input_canberra = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 3.3954660629919076, - 5.6469232737388815, - 6.373112846266441, - 4.0212880272531715, - 6.916281504639404, - 5.741508386786526, - 5.411470999663036, - 9.0, - 4.977014354725805, - 3.3954660629919076, - 0.0, - 7.56256082439209, - 5.540261147481582, - 4.832322929216881, - 4.62003193872216, - 6.498056792320361, - 4.309846252268695, - 6.317531174829905, - 6.016362684141827, - 5.6469232737388815, - 7.56256082439209, - 0.0, - 5.974878731322299, - 4.898357301336036, - 6.442097410320605, - 5.227077347287883, - 7.134101195584642, - 5.457753923371659, - 7.0, - 6.373112846266441, - 5.540261147481582, - 5.974878731322299, - 0.0, - 5.5507273748583, - 4.897749658726415, - 9.0, - 8.398776718824767, - 3.908281400328807, - 4.83431066343688, - 4.0212880272531715, - 4.832322929216881, - 4.898357301336036, - 5.5507273748583, - 0.0, - 6.632989819428174, - 7.438852294822894, - 5.6631570310967465, - 7.579428202635459, - 6.760811985364303, - 6.916281504639404, - 4.62003193872216, - 6.442097410320605, - 4.897749658726415, - 6.632989819428174, - 0.0, - 5.249404187382862, - 6.072559523278559, - 4.07661278488929, - 6.19678948003145, - 5.741508386786526, - 6.498056792320361, - 5.227077347287883, - 9.0, - 7.438852294822894, - 5.249404187382862, - 0.0, - 3.854811639654704, - 6.652724827169063, - 5.298236851430971, - 5.411470999663036, - 4.309846252268695, - 7.134101195584642, - 8.398776718824767, - 5.6631570310967465, - 6.072559523278559, - 3.854811639654704, - 0.0, - 7.529184598969917, - 6.903282911791188, - 9.0, - 6.317531174829905, - 5.457753923371659, - 3.908281400328807, - 7.579428202635459, - 4.07661278488929, - 6.652724827169063, - 7.529184598969917, - 0.0, - 7.0, - 4.977014354725805, - 6.016362684141827, - 7.0, - 4.83431066343688, - 6.760811985364303, - 6.19678948003145, - 5.298236851430971, - 6.903282911791188, - 7.0, - 0.0}, - raft::distance::DistanceType::Canberra, - 0.0}; - -const InputConfiguration input_lp_unexpanded = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 1.31462855332296, - 1.3690307816129905, - 1.698603990921237, - 1.3460470789553531, - 1.6636670712582544, - 1.2651744044972217, - 1.1938329352055201, - 1.8811409082590185, - 1.3653115050624267, - 1.31462855332296, - 0.0, - 1.9447722703291133, - 1.42818777206562, - 1.4685491458946494, - 1.3071999866010466, - 1.4988622861692171, - 0.9698559287406783, - 1.4972023224597841, - 1.5243383567266802, - 1.3690307816129905, - 1.9447722703291133, - 0.0, - 1.2748400840107568, - 1.0599569946448246, - 1.546591282841402, - 1.147526531928459, - 1.447002179128145, - 1.5982242387673176, - 1.3112533607072414, - 1.698603990921237, - 1.42818777206562, - 1.2748400840107568, - 0.0, - 1.038121552545461, - 1.011788365364402, - 1.3907391109256988, - 1.3128200942311496, - 1.19595706584447, - 1.3233328139624725, - 1.3460470789553531, - 1.4685491458946494, - 1.0599569946448246, - 1.038121552545461, - 0.0, - 1.3642741698145529, - 1.3493868683808095, - 1.394942694628328, - 1.572881849642552, - 1.380122665319464, - 1.6636670712582544, - 1.3071999866010466, - 1.546591282841402, - 1.011788365364402, - 1.3642741698145529, - 0.0, - 1.018961640373018, - 1.0114394258945634, - 0.8338711034820684, - 1.1247823842299223, - 1.2651744044972217, - 1.4988622861692171, - 1.147526531928459, - 1.3907391109256988, - 1.3493868683808095, - 1.018961640373018, - 0.0, - 0.7701238110357329, - 1.245486437864406, - 0.5551259549534626, - 1.1938329352055201, - 0.9698559287406783, - 1.447002179128145, - 1.3128200942311496, - 1.394942694628328, - 1.0114394258945634, - 0.7701238110357329, - 0.0, - 1.1886800117391216, - 1.0083692448135637, - 1.8811409082590185, - 1.4972023224597841, - 1.5982242387673176, - 1.19595706584447, - 1.572881849642552, - 0.8338711034820684, - 1.245486437864406, - 1.1886800117391216, - 0.0, - 1.3661374102525012, - 1.3653115050624267, - 1.5243383567266802, - 1.3112533607072414, - 1.3233328139624725, - 1.380122665319464, - 1.1247823842299223, - 0.5551259549534626, - 1.0083692448135637, - 1.3661374102525012, - 0.0}, - raft::distance::DistanceType::LpUnexpanded, - 2.0}; - -const InputConfiguration input_linf = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 0.9251771844789913, - 0.9036452083899731, - 0.9251771844789913, - 0.8706483735804971, - 0.9251771844789913, - 0.717493881903289, - 0.6920214832303888, - 0.9251771844789913, - 0.9251771844789913, - 0.9251771844789913, - 0.0, - 0.9036452083899731, - 0.8655339692155823, - 0.8706483735804971, - 0.8655339692155823, - 0.8655339692155823, - 0.6329837991017668, - 0.8655339692155823, - 0.8655339692155823, - 0.9036452083899731, - 0.9036452083899731, - 0.0, - 0.7988276152181608, - 0.7028075145996631, - 0.9036452083899731, - 0.9036452083899731, - 0.9036452083899731, - 0.8429599432532096, - 0.9036452083899731, - 0.9251771844789913, - 0.8655339692155823, - 0.7988276152181608, - 0.0, - 0.48376552205293305, - 0.8206394616536681, - 0.8206394616536681, - 0.8206394616536681, - 0.8429599432532096, - 0.8206394616536681, - 0.8706483735804971, - 0.8706483735804971, - 0.7028075145996631, - 0.48376552205293305, - 0.0, - 0.8706483735804971, - 0.8706483735804971, - 0.8706483735804971, - 0.8429599432532096, - 0.8706483735804971, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.0, - 0.8853924473642432, - 0.535821510936138, - 0.6497196601457607, - 0.8853924473642432, - 0.717493881903289, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.0, - 0.5279604218147174, - 0.6658348373853169, - 0.33799874888632914, - 0.6920214832303888, - 0.6329837991017668, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.535821510936138, - 0.5279604218147174, - 0.0, - 0.662579808115858, - 0.5079750812968089, - 0.9251771844789913, - 0.8655339692155823, - 0.8429599432532096, - 0.8429599432532096, - 0.8429599432532096, - 0.6497196601457607, - 0.6658348373853169, - 0.662579808115858, - 0.0, - 0.8429599432532096, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.33799874888632914, - 0.5079750812968089, - 0.8429599432532096, - 0.0}, - raft::distance::DistanceType::Linf, - 0.0}; - -const InputConfiguration input_l1 = { - 4, - {0, 1, 1, 2, 4}, - {3, 2, 0, 1}, // indices - {0.99296, 0.42180, 0.11687, 0.305869}, - { - // dense output - 0.0, - 0.99296, - 1.41476, - 1.415707, - 0.99296, - 0.0, - 0.42180, - 0.42274, - 1.41476, - 0.42180, - 0.0, - 0.84454, - 1.41570, - 0.42274, - 0.84454, - 0.0, - }, - raft::distance::DistanceType::L1, +const InputConfiguration input_canberra = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra, 0.0}; +const InputConfiguration input_lp_unexpanded = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}; + +const InputConfiguration input_linf = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf, + 0.0}; + +const InputConfiguration input_l1 = {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + raft::distance::DistanceType::L1, + 0.0}; + // test dense smem strategy -const std::vector< - SparseDistanceCOOSPMVInputs> - inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded}, - {input_canberra}, {input_lp_unexpanded}, - {input_linf}, {input_l1}}; +const std::vector> + inputs_dense_strategy = {{input_inner_product}, + {input_l2_unexpanded}, + {input_canberra}, + {input_lp_unexpanded}, + {input_linf}, + {input_l1}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestDenseStrategyF; @@ -662,22 +664,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests, ::testing::ValuesIn(inputs_dense_strategy)); // test hash and chunk strategy -const std::vector> - inputs_hash_strategy = {{input_inner_product}, - {input_inner_product, 0.5, 2}, - {input_l2_unexpanded}, - {input_l2_unexpanded, 0.5, 2}, - {input_canberra}, - {input_canberra, 0.5, 2}, - {input_canberra, 0.5, 6}, - {input_lp_unexpanded}, - {input_lp_unexpanded, 0.5, 2}, - {input_lp_unexpanded, 0.5, 6}, - {input_linf}, - {input_linf, 0.5, 2}, - {input_linf, 0.5, 6}, - {input_l1}, - {input_l1, 0.5, 2}}; +const std::vector> inputs_hash_strategy = { + {input_inner_product}, + {input_inner_product, 0.5, 2}, + {input_l2_unexpanded}, + {input_l2_unexpanded, 0.5, 2}, + {input_canberra}, + {input_canberra, 0.5, 2}, + {input_canberra, 0.5, 6}, + {input_lp_unexpanded}, + {input_lp_unexpanded, 0.5, 2}, + {input_lp_unexpanded, 0.5, 6}, + {input_linf}, + {input_linf, 0.5, 2}, + {input_linf, 0.5, 6}, + {input_l1}, + {input_l1, 0.5, 2}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestHashStrategyF; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 0589637061..8d6675f954 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -50,8 +50,8 @@ struct SparseDistanceInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseDistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) +{ return os; } @@ -61,24 +61,24 @@ class SparseDistanceTest public: SparseDistanceTest() : dist_config(handle) {} - void SetUp() override { - params = ::testing::TestWithParam< - SparseDistanceInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); make_data(); - dist_config.b_nrows = params.indptr_h.size() - 1; - dist_config.b_ncols = params.n_cols; - dist_config.b_nnz = params.indices_h.size(); - dist_config.b_indptr = indptr; + dist_config.b_nrows = params.indptr_h.size() - 1; + dist_config.b_ncols = params.n_cols; + dist_config.b_nnz = params.indices_h.size(); + dist_config.b_indptr = indptr; dist_config.b_indices = indices; - dist_config.b_data = data; - dist_config.a_nrows = params.indptr_h.size() - 1; - dist_config.a_ncols = params.n_cols; - dist_config.a_nnz = params.indices_h.size(); - dist_config.a_indptr = indptr; + dist_config.b_data = data; + dist_config.a_nrows = params.indptr_h.size() - 1; + dist_config.a_ncols = params.n_cols; + dist_config.a_nnz = params.indices_h.size(); + dist_config.a_indptr = indptr; dist_config.a_indices = indices; - dist_config.a_data = data; + dist_config.a_data = data; int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -89,7 +89,8 @@ class SparseDistanceTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -98,33 +99,34 @@ class SparseDistanceTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, - params.out_dists_ref_h.size(), - CompareApprox(1e-3))); + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref, out_dists, params.out_dists_ref_h.size(), CompareApprox(1e-3))); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), - handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), - handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); std::vector out_dists_ref_h = params.out_dists_ref_h; allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + update_device(out_dists_ref, + out_dists_ref_h.data(), + out_dists_ref_h.size(), dist_config.handle.get_stream()); } @@ -132,7 +134,7 @@ class SparseDistanceTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_t *out_dists, *out_dists_ref; @@ -187,8 +189,7 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}, {2, @@ -219,40 +220,33 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, - 0.58146987, 0.44940102, 1., 0.76978799, 0.39419924, 0., - 0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481, - 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, - 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., - 0.79593037, 0.48904013, 0.51413997, 0., 0.28605559, 0.35772784, - 1., 0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801, - 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, - 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, - 0.58623212, 0., 0.77917274, 0.48390993, 0.24558392, 0.99166225, - 0.58146987, 0.73323749, 0.67534399, 1., 0.6745457, 0.77917274, - 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, - 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., - 0.51360432, 0.68185144, 1., 0.54847744, 0.8321819, 0.43324829, - 0.67676228, 0.24558392, 0.76064776, 0.51360432, 0., 1., - 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, 0.61547536, 0.68185144, 1., 0.}, raft::distance::DistanceType::CosineExpanded, 0.0}, {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, @@ -361,15 +355,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 3.3954660629919076, 5.6469232737388815, @@ -475,15 +467,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 1.31462855332296, 1.3690307816129905, @@ -589,15 +579,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 0.9251771844789913, 0.9036452083899731, @@ -703,17 +691,14 @@ const std::vector> inputs_i32_f = { {15, {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, - {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, - 0, 3, 7, 8, 12, 0, 2, 5, 7, 8, 14, 4, 9, 10, 11, - 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, - {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, - 0.73789274, 0.08450219, 1., 0.20184723, 0.18036963, 0.12581403, - 0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555, - 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, - 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, - 0.15605804, 0.3867739, 0.24908977, 0.36413632, 0.37643732, 0.28910679, - 0.0198409, 0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969, - 0.26190054, 0.2077349, 0.10803964}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, @@ -772,31 +757,25 @@ const std::vector> inputs_i32_f = { {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, - {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, - 0.99584118, 0.76835667, 0.34426657, 0.2357925, 0.01274851, 0.11422017, - 0.3437756, 0.31967718, 0.5956055, 0.31610373, 0.04147273, 0.03724415, - 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, - 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, - 0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608, - 0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295, - 0.27366735, 0.5119944, 0.11416134}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, {// dense output - 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, - 0.76962708, 1.122858, 1.1232498, 1.08166081, 0.48769777, 0., - 1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055, - 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, - 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, - 0.26127048, 0.98318907, 1.82943642, 0., 0.29945563, 1.08494093, - 0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815, - 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, - 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, - 0.45060069, 0., 1.29899154, 1.40683824, 0.48505269, 0.53862363, - 0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154, - 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, - 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., - 1.47318624, 1.92660889, 1.1232498, 0.40658897, 0.60215168, 1.74288748, - 1.18328348, 0.48505269, 1.92108999, 1.47318624, 0., 0.24992619, - 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, 1.88812175, 1.92660889, 0.24992619, 0.}, raft::distance::DistanceType::CorrelationExpanded, 0.0}, @@ -805,12 +784,11 @@ const std::vector> inputs_i32_f = { {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., - 1., 1., 1., 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., - 1., 1., 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., - 1., 1., 1., 1., 0., 1., 0.8, 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., - 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, raft::distance::DistanceType::RusselRaoExpanded, 0.0}, {5, @@ -818,13 +796,12 @@ const std::vector> inputs_i32_f = { {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, - 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., - 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., - 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, - 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0., 0.2, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, raft::distance::DistanceType::HammingUnexpanded, 0.0}, {3, @@ -868,7 +845,8 @@ const std::vector> inputs_i32_f = { typedef SparseDistanceTest SparseDistanceTestF; TEST_P(SparseDistanceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF, +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // namespace distance diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index f7954f899f..02be95c8a8 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -36,8 +36,7 @@ struct SparseFilterInputs { }; template -class SparseFilterTests - : public ::testing::TestWithParam> { +class SparseFilterTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -50,14 +49,14 @@ class SparseFilterTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseFilterTests COORemoveZeros; -TEST_P(COORemoveZeros, Result) { +TEST_P(COORemoveZeros, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + std::shared_ptr alloc(new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); - float *in_h_vals = new float[params.nnz]; + float* in_h_vals = new float[params.nnz]; COO in(alloc, stream, params.nnz, 5, 5); @@ -70,8 +69,8 @@ TEST_P(COORemoveZeros, Result) { in_h_vals[2] = 0; in_h_vals[3] = 0; - int *in_h_rows = new int[params.nnz]; - int *in_h_cols = new int[params.nnz]; + int* in_h_rows = new int[params.nnz]; + int* in_h_cols = new int[params.nnz]; for (int i = 0; i < params.nnz; i++) { in_h_rows[i] = params.nnz - i - 1; @@ -87,9 +86,9 @@ TEST_P(COORemoveZeros, Result) { int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; - float *out_vals_ref_h = (float *)malloc(2 * sizeof(float)); - out_vals_ref_h[0] = in_h_vals[4]; - out_vals_ref_h[1] = in_h_vals[1]; + float* out_vals_ref_h = (float*)malloc(2 * sizeof(float)); + out_vals_ref_h[0] = in_h_vals[4]; + out_vals_ref_h[1] = in_h_vals[1]; COO out_ref(alloc, stream, 2, 5, 5); COO out(alloc, stream); @@ -100,12 +99,9 @@ TEST_P(COORemoveZeros, Result) { op::coo_remove_zeros<32, float>(&in, &out, alloc, stream); - ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); free(out_vals_ref_h); @@ -115,8 +111,7 @@ TEST_P(COORemoveZeros, Result) { delete[] in_h_vals; } -INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 8c3bf36318..ca9da0bc05 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -50,39 +50,53 @@ struct SparseKNNInputs { int batch_size_index = 2; int batch_size_query = 2; - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded; + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded; }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseKNNInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ return os; } template -class SparseKNNTest - : public ::testing::TestWithParam> { +class SparseKNNTest : public ::testing::TestWithParam> { public: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); n_rows = params.indptr_h.size() - 1; - nnz = params.indices_h.size(); - k = params.k; + nnz = params.indices_h.size(); + k = params.k; make_data(); - raft::sparse::selection::brute_force_knn( - indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data, - nnz, n_rows, params.n_cols, out_indices, out_dists, k, handle, - params.batch_size_index, params.batch_size_query, params.metric); + raft::sparse::selection::brute_force_knn(indptr, + indices, + data, + nnz, + n_rows, + params.n_cols, + indptr, + indices, + data, + nnz, + n_rows, + params.n_cols, + out_indices, + out_dists, + k, + handle, + params.batch_size_index, + params.batch_size_query, + params.metric); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); CUDA_CHECK(cudaFree(data)); @@ -92,39 +106,37 @@ class SparseKNNTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, - CompareApprox(1e-4))); - ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, - Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare())); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), - handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), - handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_dists_ref, out_dists_ref_h.size()); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), handle.get_stream()); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), - handle.get_stream()); + update_device( + out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), handle.get_stream()); + update_device( + out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); allocate(out_dists, n_rows * k); allocate(out_indices, n_rows * k); @@ -136,14 +148,14 @@ class SparseKNNTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data - value_idx *out_indices; - value_t *out_dists; + value_idx* out_indices; + value_t* out_dists; - value_idx *out_indices_ref; - value_t *out_dists_ref; + value_idx* out_indices_ref; + value_t* out_dists_ref; SparseKNNInputs params; }; @@ -161,8 +173,7 @@ const std::vector> inputs_i32_f = { raft::distance::DistanceType::L2SqrtExpanded}}; typedef SparseKNNTest SparseKNNTestF; TEST_P(SparseKNNTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection }; // end namespace sparse diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..f660e68aa3 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -29,8 +29,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -50,22 +51,21 @@ struct KNNGraphInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const KNNGraphInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs& dims) +{ return os; } template -class KNNGraphTest - : public ::testing::TestWithParam> { - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); +class KNNGraphTest : public ::testing::TestWithParam> { + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; auto alloc = handle.get_device_allocator(); - stream = handle.get_stream(); + stream = handle.get_stream(); out = new raft::sparse::COO(alloc, stream); @@ -74,8 +74,7 @@ class KNNGraphTest update_device(X, params.X.data(), params.X.size(), stream); raft::sparse::selection::knn_graph( - handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, - *out); + handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); rmm::device_uvector sum(1, stream); @@ -91,7 +90,8 @@ class KNNGraphTest CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(X)); delete out; @@ -101,9 +101,9 @@ class KNNGraphTest cudaStream_t stream; // input data - raft::sparse::COO *out; + raft::sparse::COO* out; - value_t *X; + value_t* X; value_idx sum_h; @@ -115,13 +115,15 @@ const std::vector> knn_graph_inputs_fint = { {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}}; typedef KNNGraphTest KNNGraphTestF_int; -TEST_P(KNNGraphTestF_int, Result) { +TEST_P(KNNGraphTestF_int, Result) +{ // nnz should not be larger than twice m * k ASSERT_TRUE(out->nnz <= (params.m * params.k * 2)); ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int, +INSTANTIATE_TEST_CASE_P(KNNGraphTest, + KNNGraphTestF_int, ::testing::ValuesIn(knn_graph_inputs_fint)); } // namespace sparse diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index ce567e4298..0ca7cec4e9 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -55,45 +55,44 @@ struct LinkageInputs { * @param b: number of pairs of points that both the clusters have classified differently */ template -__global__ void computeTheNumerator(const T* firstClusterArray, - const T* secondClusterArray, uint64_t size, - uint64_t* a, uint64_t* b) { - //calculating the indices of pairs of datapoints compared by the current thread +__global__ void computeTheNumerator( + const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b) +{ + // calculating the indices of pairs of datapoints compared by the current thread uint64_t j = threadIdx.x + blockIdx.x * blockDim.x; uint64_t i = threadIdx.y + blockIdx.y * blockDim.y; - //thread-local variables to count a and b + // thread-local variables to count a and b uint64_t myA = 0, myB = 0; if (i < size && j < size && j < i) { - //checking if the pair have been classified the same by both the clusters + // checking if the pair have been classified the same by both the clusters if (firstClusterArray[i] == firstClusterArray[j] && secondClusterArray[i] == secondClusterArray[j]) { ++myA; } - //checking if the pair have been classified differently by both the clusters + // checking if the pair have been classified differently by both the clusters else if (firstClusterArray[i] != firstClusterArray[j] && secondClusterArray[i] != secondClusterArray[j]) { ++myB; } } - //specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce + // specialize blockReduce for a 2D block of 1024 threads of type uint64_t + typedef cub::BlockReduce BlockReduce; - //Allocate shared memory for blockReduce + // Allocate shared memory for blockReduce __shared__ typename BlockReduce::TempStorage temp_storage; - //summing up thread-local counts specific to a block + // summing up thread-local counts specific to a block myA = BlockReduce(temp_storage).Sum(myA); __syncthreads(); myB = BlockReduce(temp_storage).Sum(myB); __syncthreads(); - //executed once per block + // executed once per block if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd((unsigned long long int*)a, myA); raft::myAtomicAdd((unsigned long long int*)b, myB); @@ -101,102 +100,105 @@ __global__ void computeTheNumerator(const T* firstClusterArray, } /** -* @brief Function to calculate RandIndex -* more info on rand index -* @param firstClusterArray: the array of classes of type T -* @param secondClusterArray: the array of classes of type T -* @param size: the size of the data points of type uint64_t -* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr -* @param stream: the cudaStream object -*/ + * @brief Function to calculate RandIndex + * more info on rand index + * @param firstClusterArray: the array of classes of type T + * @param secondClusterArray: the array of classes of type T + * @param size: the size of the data points of type uint64_t + * @param allocator: object that takes care of temporary device memory allocation of type + * std::shared_ptr + * @param stream: the cudaStream object + */ template -double compute_rand_index( - T* firstClusterArray, T* secondClusterArray, uint64_t size, - std::shared_ptr allocator, cudaStream_t stream) { - //rand index for size less than 2 is not defined +double compute_rand_index(T* firstClusterArray, + T* secondClusterArray, + uint64_t size, + std::shared_ptr allocator, + cudaStream_t stream) +{ + // rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - //allocating and initializing memory for a and b in the GPU + // allocating and initializing memory for a and b in the GPU raft::mr::device::buffer arr_buf(allocator, stream, 2); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); - //kernel configuration + // kernel configuration static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 numBlocks(raft::ceildiv(size, numThreadsPerBlock.x), raft::ceildiv(size, numThreadsPerBlock.y)); - //calling the kernel - computeTheNumerator - <<>>( - firstClusterArray, secondClusterArray, size, arr_buf.data(), - arr_buf.data() + 1); + // calling the kernel + computeTheNumerator<<>>( + firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1); - //synchronizing and updating the calculated values of a and b from device to host + // synchronizing and updating the calculated values of a and b from device to host uint64_t ab_host[2] = {0}; raft::update_host(ab_host, arr_buf.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - //error handling + // error handling CUDA_CHECK(cudaGetLastError()); - //denominator + // denominator uint64_t nChooseTwo = size * (size - 1) / 2; - //calculating the rand_index + // calculating the rand_index return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo); } template -::std::ostream& operator<<(::std::ostream& os, - const LinkageInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const LinkageInputs& dims) +{ return os; } template class LinkageTest : public ::testing::TestWithParam> { protected: - void basicTest() { + void basicTest() + { raft::handle_t handle; params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); // Allocate result labels and expected labels on device raft::allocate(labels, params.n_row); raft::allocate(labels_ref, params.n_row); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, handle.get_stream()); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, - handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, handle.get_stream()); out_arrs.children = out_children.data(); - raft::hierarchy::single_linkage< - IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c, + raft::hierarchy::single_linkage( + handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + &out_arrs, + params.c, params.n_clusters); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = - compute_rand_index(labels, labels_ref, params.n_row, - handle.get_device_allocator(), handle.get_stream()); + score = compute_rand_index( + labels, labels_ref, params.n_row, handle.get_device_allocator(), handle.get_stream()); } void SetUp() override { basicTest(); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(labels)); CUDA_CHECK(cudaFree(labels_ref)); } @@ -212,14 +214,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10, @@ -227,8 +227,7 @@ const std::vector> linkage_inputsf2 = { // // Test outlier points {9, 2, - {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, - 10, 50, 30, 5}, + {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5}, {6, 0, 5, 0, 0, 4, 3, 2, 1}, 7, -1}, @@ -236,14 +235,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == (n_points / 2) {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {1, 0, 4, 0, 0, 3, 2, 0, 2, 1}, 5, @@ -252,340 +249,173 @@ const std::vector> linkage_inputsf2 = { // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -598,6 +428,5 @@ const std::vector> linkage_inputsf2 = { typedef LinkageTest LinkageTestF_Int; TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); } -INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, - ::testing::ValuesIn(linkage_inputsf2)); +INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2)); } // end namespace raft diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 7adbbf8b9a..4897d8194b 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -39,12 +39,11 @@ struct CSRRowNormalizeInputs { }; template -class CSRRowNormalizeTest - : public ::testing::TestWithParam> { +class CSRRowNormalizeTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - CSRRowNormalizeInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); raft::allocate(in_vals, params.in_vals.size()); @@ -53,9 +52,10 @@ class CSRRowNormalizeTest raft::allocate(result, params.verify.size(), true); } - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.in_vals.size(); + Index_ nnz = params.in_vals.size(); raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(in_vals, params.in_vals.data(), nnz, stream); @@ -63,20 +63,18 @@ class CSRRowNormalizeTest switch (params.method) { case MAX: - linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, - result, stream); + linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream); break; case L1: - linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, - result, stream); + linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream); break; } - ASSERT_TRUE( - raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(in_vals)); CUDA_CHECK(cudaFree(verify)); @@ -87,7 +85,7 @@ class CSRRowNormalizeTest protected: CSRRowNormalizeInputs params; cudaStream_t stream; - Index_ *ex_scan; + Index_* ex_scan; Type_f *in_vals, *result, *verify; }; @@ -118,9 +116,11 @@ const std::vector> csrnormalize_inputs_d = { {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestF, ::testing::ValuesIn(csrnormalize_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestD, ::testing::ValuesIn(csrnormalize_inputs_d)); } // namespace sparse diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 50b5dc5993..44098214d2 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -42,19 +42,19 @@ struct SparseReduceInputs { }; template -class SparseReduceTest - : public ::testing::TestWithParam> { +class SparseReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - SparseReduceInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); } - void Run() { + void Run() + { raft::handle_t handle; auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); rmm::device_uvector in_cols(params.in_cols.size(), stream); @@ -63,30 +63,29 @@ class SparseReduceTest rmm::device_uvector out_cols(params.out_cols.size(), stream); rmm::device_uvector out_vals(params.out_vals.size(), stream); - raft::update_device(in_rows.data(), params.in_rows.data(), - params.in_rows.size(), stream); - raft::update_device(in_cols.data(), params.in_cols.data(), - params.in_cols.size(), stream); - raft::update_device(in_vals.data(), params.in_vals.data(), - params.in_vals.size(), stream); - raft::update_device(out_rows.data(), params.out_rows.data(), - params.out_rows.size(), stream); - raft::update_device(out_cols.data(), params.out_cols.data(), - params.out_cols.size(), stream); - raft::update_device(out_vals.data(), params.out_vals.data(), - params.out_vals.size(), stream); + raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream); + raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream); + raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream); + raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream); + raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream); + raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); raft::sparse::COO out(d_alloc, stream); - raft::sparse::op::max_duplicates(handle, out, in_rows.data(), - in_cols.data(), in_vals.data(), - params.in_rows.size(), params.m, params.n); + raft::sparse::op::max_duplicates(handle, + out, + in_rows.data(), + in_cols.data(), + in_vals.data(), + params.in_rows.size(), + params.m, + params.n); ASSERT_TRUE(raft::devArrMatch( out_rows.data(), out.rows(), out.nnz, raft::Compare())); ASSERT_TRUE(raft::devArrMatch( out_cols.data(), out.cols(), out.nnz, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, raft::Compare())); } void TearDown() override {} @@ -115,7 +114,8 @@ const std::vector> max_reduce_inputs_f = { 4}, }; -INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF, +INSTANTIATE_TEST_CASE_P(SparseReduceTest, + SparseReduceTestF, ::testing::ValuesIn(max_reduce_inputs_f)); } // namespace sparse diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index b64fa25883..feefa7baa3 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -38,43 +38,47 @@ struct CSRRowOpInputs { /** Wrapper to call csr_row_op because the enclosing function of a __device__ * lambda cannot have private ot protected access within the class. */ template -void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz, - Type_f *result, cudaStream_t stream) { +void csr_row_op_wrapper( + const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream) +{ op::csr_row_op( - row_ind, n_rows, nnz, + row_ind, + n_rows, + nnz, [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) { - for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row; + for (Index_ i = start_idx; i < stop_idx; i++) + result[i] = row; }, stream); } template -class CSRRowOpTest - : public ::testing::TestWithParam> { +class CSRRowOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); n_rows = params.ex_scan.size(); - nnz = params.verify.size(); + nnz = params.verify.size(); raft::allocate(verify, nnz); raft::allocate(ex_scan, n_rows); raft::allocate(result, nnz, true); } - void Run() { + void Run() + { raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(verify, params.verify.data(), nnz, stream); csr_row_op_wrapper(ex_scan, n_rows, nnz, result, stream); - ASSERT_TRUE( - raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(verify)); CUDA_CHECK(cudaFree(result)); @@ -85,7 +89,7 @@ class CSRRowOpTest CSRRowOpInputs params; cudaStream_t stream; Index_ n_rows, nnz; - Index_ *ex_scan; + Index_* ex_scan; Type_f *result, *verify; }; @@ -102,10 +106,8 @@ const std::vector> csrrowop_inputs_d = { {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, - ::testing::ValuesIn(csrrowop_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, - ::testing::ValuesIn(csrrowop_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 46f2f6a844..5d3b2a8317 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -45,8 +45,9 @@ struct SparseSelectionInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSelectionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSelectionInputs& dims) +{ return os; } @@ -54,7 +55,8 @@ template class SparseSelectionTest : public ::testing::TestWithParam> { protected: - void make_data() { + void make_data() + { std::vector dists_h = params.dists_h; allocate(dists, n_rows * n_cols); @@ -63,42 +65,39 @@ class SparseSelectionTest allocate(inds, n_rows * n_cols); iota_fill(inds, n_rows, n_cols, stream); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_dists_ref, out_dists_ref_h.size()); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), - stream); + update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), stream); allocate(out_dists, n_rows * k); allocate(out_indices, n_rows * k); } - void SetUp() override { - params = ::testing::TestWithParam< - SparseSelectionInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); n_rows = params.n_rows; n_cols = params.n_cols; - k = params.k; + k = params.k; make_data(); - raft::sparse::selection::select_k(dists, inds, n_rows, n_cols, out_dists, - out_indices, params.select_min, k, - stream); + raft::sparse::selection::select_k( + dists, inds, n_rows, n_cols, out_dists, out_indices, params.select_min, k, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(dists)); @@ -111,11 +110,10 @@ class SparseSelectionTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void compare() { - ASSERT_TRUE( - devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare())); - ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, - Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare())); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare())); } protected: @@ -124,15 +122,15 @@ class SparseSelectionTest int n_rows, n_cols, k; // input data - value_t *dists; - value_idx *inds; + value_t* dists; + value_idx* inds; // output data - value_idx *out_indices; - value_t *out_dists; + value_idx* out_indices; + value_t* out_dists; - value_idx *out_indices_ref; - value_t *out_dists_ref; + value_idx* out_indices_ref; + value_t* out_dists_ref; SparseSelectionInputs params; }; @@ -149,7 +147,8 @@ const std::vector> inputs_i32_f = { true}}; typedef SparseSelectionTest SparseSelectionTestF; TEST_P(SparseSelectionTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF, +INSTANTIATE_TEST_CASE_P(SparseSelectionTest, + SparseSelectionTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index b9a8b849eb..e154d19d34 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -47,27 +47,27 @@ class SparseSortTest : public ::testing::TestWithParam> { const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseSortTest COOSort; -TEST_P(COOSort, Result) { +TEST_P(COOSort, Result) +{ int *in_rows, *in_cols, *verify; - float *in_vals; + float* in_vals; params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + std::shared_ptr alloc(new raft::mr::device::default_allocator); raft::allocate(in_vals, params.nnz); r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); - int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); - int *in_cols_h = (int *)malloc(params.nnz * sizeof(int)); - int *verify_h = (int *)malloc(params.nnz * sizeof(int)); + int* in_rows_h = (int*)malloc(params.nnz * sizeof(int)); + int* in_cols_h = (int*)malloc(params.nnz * sizeof(int)); + int* verify_h = (int*)malloc(params.nnz * sizeof(int)); for (int i = 0; i < params.nnz; i++) { in_rows_h[i] = params.nnz - i - 1; - verify_h[i] = i; + verify_h[i] = i; in_cols_h[i] = i; } @@ -80,11 +80,9 @@ TEST_P(COOSort, Result) { raft::update_device(in_cols, in_cols_h, params.nnz, stream); raft::update_device(verify, verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, - stream); + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, stream); - ASSERT_TRUE( - raft::devArrMatch(verify, in_rows, params.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, in_rows, params.nnz, raft::Compare())); delete[] in_rows_h; delete[] in_cols_h; diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d104028d2b..6a66daa769 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -29,8 +29,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -49,19 +50,21 @@ struct SparseSymmetrizeInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSymmetrizeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSymmetrizeInputs& dims) +{ return os; } template -class SparseSymmetrizeTest : public ::testing::TestWithParam< - SparseSymmetrizeInputs> { +class SparseSymmetrizeTest + : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -72,19 +75,19 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< update_device(data, data_h.data(), data_h.size(), stream); } - void SetUp() override { - params = ::testing::TestWithParam< - SparseSymmetrizeInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; auto alloc = handle.get_device_allocator(); - stream = handle.get_stream(); + stream = handle.get_stream(); make_data(); - value_idx m = params.indptr_h.size() - 1; - value_idx n = params.n_cols; + value_idx m = params.indptr_h.size() - 1; + value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); raft::mr::device::buffer coo_rows(alloc, stream, nnz); @@ -93,8 +96,8 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::COO out(alloc, stream); - raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, - n, coo_rows.size(), out); + raft::sparse::linalg::symmetrize( + handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); raft::mr::device::buffer sum(alloc, stream, 1); @@ -107,7 +110,8 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -119,7 +123,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; value_idx sum_h; @@ -133,8 +137,7 @@ struct COOSymmetrizeInputs { }; template -class COOSymmetrizeTest - : public ::testing::TestWithParam> { +class COOSymmetrizeTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -144,7 +147,8 @@ class COOSymmetrizeTest const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef COOSymmetrizeTest COOSymmetrize; -TEST_P(COOSymmetrize, Result) { +TEST_P(COOSymmetrize, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); @@ -153,16 +157,14 @@ TEST_P(COOSymmetrize, Result) { int nnz = 8; - int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; - float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; + int* in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; + float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; - int *exp_rows_h = - new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; - int *exp_cols_h = - new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; - float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, - 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; + int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; + int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; + float* exp_vals_h = + new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; COO in(alloc, stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); @@ -172,22 +174,19 @@ TEST_P(COOSymmetrize, Result) { COO out(alloc, stream); linalg::coo_symmetrize<32, float>( - &in, &out, - [] __device__(int row, int col, float val, float trans) { - return val + trans; - }, - alloc, stream); + &in, + &out, + [] __device__(int row, int col, float val, float trans) { return val + trans; }, + alloc, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; ASSERT_TRUE(out.nnz == nnz * 2); - ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, raft::Compare())); cudaStreamDestroy(stream); @@ -200,8 +199,7 @@ TEST_P(COOSymmetrize, Result) { delete[] exp_vals_h; } -INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf)); const std::vector> symm_inputs_fint = { // Test n_clusters == n_points @@ -221,7 +219,8 @@ const std::vector> symm_inputs_fint = { typedef SparseSymmetrizeTest SparseSymmetrizeTestF_int; TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int, +INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, + SparseSymmetrizeTestF_int, ::testing::ValuesIn(symm_inputs_fint)); } // namespace sparse diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index def1f1685b..8d35960d6a 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -29,7 +29,8 @@ namespace knn { template class HaversineKNNTest : public ::testing::Test { protected: - void basicTest() { + void basicTest() + { auto alloc = std::make_shared(); // Allocate input @@ -44,31 +45,37 @@ class HaversineKNNTest : public ::testing::Test { raft::allocate(d_pred_D, n * n); // make testdata on host - std::vector h_train_inputs = { - 0.71113885, -1.29215058, 0.59613176, -2.08048115, - 0.74932804, -1.33634042, 0.51486728, -1.65962873, - 0.53154002, -1.47049808, 0.72891737, -1.54095137}; + std::vector h_train_inputs = {0.71113885, + -1.29215058, + 0.59613176, + -2.08048115, + 0.74932804, + -1.33634042, + 0.51486728, + -1.65962873, + 0.53154002, + -1.47049808, + 0.72891737, + -1.54095137}; h_train_inputs.resize(n); raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0); - std::vector h_res_D = { - 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, - 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, - 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, - 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, - 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, - 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; + std::vector h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, + 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, + 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, + 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, + 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, + 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); raft::update_device(d_ref_D, h_res_D.data(), n * n, 0); - std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, - 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, - 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; + std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, + 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); raft::update_device(d_ref_I, h_res_I.data(), n * n, 0); - std::vector input_vec = {d_train_inputs}; + std::vector input_vec = {d_train_inputs}; std::vector sizes_vec = {n}; cudaStream_t stream; @@ -82,7 +89,8 @@ class HaversineKNNTest : public ::testing::Test { void SetUp() override { basicTest(); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(d_train_inputs)); CUDA_CHECK(cudaFree(d_pred_I)); CUDA_CHECK(cudaFree(d_pred_D)); @@ -91,27 +99,26 @@ class HaversineKNNTest : public ::testing::Test { } protected: - value_t *d_train_inputs; + value_t* d_train_inputs; int n = 6; int d = 2; int k = 6; - value_idx *d_pred_I; - value_t *d_pred_D; + value_idx* d_pred_I; + value_t* d_pred_D; - value_idx *d_ref_I; - value_t *d_ref_D; + value_idx* d_ref_I; + value_t* d_ref_D; }; typedef HaversineKNNTest HaversineKNNTestF; -TEST_F(HaversineKNNTestF, Fit) { - ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, - raft::CompareApprox(1e-3))); - ASSERT_TRUE( - raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare())); +TEST_F(HaversineKNNTestF, Fit) +{ + ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, raft::CompareApprox(1e-3))); + ASSERT_TRUE(raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare())); } } // namespace knn diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 2b1ef89f7a..d4e35c9d54 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -31,18 +31,18 @@ struct KNNInputs { std::vector labels; }; -__global__ void build_actual_output(int *output, int n_rows, int k, - const int *idx_labels, - const int64_t *indices) { +__global__ void build_actual_output( + int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices) +{ int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= n_rows * k) return; - int ind = (int)indices[element]; + int ind = (int)indices[element]; output[element] = idx_labels[ind]; } -__global__ void build_expected_output(int *output, int n_rows, int k, - const int *labels) { +__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels) +{ int row = threadIdx.x + blockDim.x * blockIdx.x; if (row >= n_rows) return; @@ -55,25 +55,33 @@ __global__ void build_expected_output(int *output, int n_rows, int k, template class KNNTest : public ::testing::TestWithParam { protected: - void testBruteForce() { - raft::print_device_vector("Input array: ", input_, rows_ * cols_, - std::cout); + void testBruteForce() + { + raft::print_device_vector("Input array: ", input_, rows_ * cols_, std::cout); std::cout << "K: " << k_ << "\n"; - raft::print_device_vector("Labels array: ", search_labels_, rows_, - std::cout); + raft::print_device_vector("Labels array: ", search_labels_, rows_, std::cout); auto stream = handle_.get_stream(); raft::allocate(actual_labels_, rows_ * k_, true); raft::allocate(expected_labels_, rows_ * k_, true); - std::vector input_vec; + std::vector input_vec; std::vector sizes_vec; input_vec.push_back(input_); sizes_vec.push_back(rows_); - brute_force_knn(handle_, input_vec, sizes_vec, cols_, search_data_, rows_, - indices_, distances_, k_, true, true); + brute_force_knn(handle_, + input_vec, + sizes_vec, + cols_, + search_data_, + rows_, + indices_, + distances_, + k_, + true, + true); build_actual_output<<>>( actual_labels_, rows_, k_, search_labels_, indices_); @@ -81,24 +89,20 @@ class KNNTest : public ::testing::TestWithParam { build_expected_output<<>>( expected_labels_, rows_, k_, search_labels_); - raft::print_device_vector("Output indices: ", indices_, rows_ * k_, - std::cout); - raft::print_device_vector("Output distances: ", distances_, rows_ * k_, - std::cout); - raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, - std::cout); - raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, - std::cout); - - ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, - raft::Compare())); + raft::print_device_vector("Output indices: ", indices_, rows_ * k_, std::cout); + raft::print_device_vector("Output distances: ", distances_, rows_ * k_, std::cout); + raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, std::cout); + raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, std::cout); + + ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, raft::Compare())); } - void SetUp() override { + void SetUp() override + { params_ = ::testing::TestWithParam::GetParam(); - rows_ = params_.input.size(); - cols_ = params_.input[0].size(); - k_ = params_.k; + rows_ = params_.input.size(); + cols_ = params_.input[0].size(); + k_ = params_.k; std::vector row_major_input; for (int i = 0; i < params_.input.size(); ++i) { @@ -107,14 +111,12 @@ class KNNTest : public ::testing::TestWithParam { } } rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float), - handle_.get_stream()); - float *input_ptr = static_cast(input_d.data()); + row_major_input.data(), row_major_input.size() * sizeof(float), handle_.get_stream()); + float* input_ptr = static_cast(input_d.data()); rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int), - handle_.get_stream()); - int *labels_ptr = static_cast(labels_d.data()); + params_.labels.data(), params_.labels.size() * sizeof(int), handle_.get_stream()); + int* labels_ptr = static_cast(labels_d.data()); raft::allocate(input_, rows_ * cols_, true); raft::allocate(search_data_, rows_ * cols_, true); @@ -127,7 +129,8 @@ class KNNTest : public ::testing::TestWithParam { raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream()); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(search_data_)); CUDA_CHECK(cudaFree(indices_)); CUDA_CHECK(cudaFree(distances_)); @@ -139,15 +142,15 @@ class KNNTest : public ::testing::TestWithParam { KNNInputs params_; int rows_; int cols_; - float *input_; - float *search_data_; - int64_t *indices_; - float *distances_; + float* input_; + float* search_data_; + int64_t* indices_; + float* distances_; int k_; - int *search_labels_; - int *actual_labels_; - int *expected_labels_; + int* search_labels_; + int* actual_labels_; + int* expected_labels_; }; const std::vector inputs = { diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index e5c2d52764..2d7d713717 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -32,7 +32,8 @@ struct csr_view_t { index_type number_of_edges; }; } // namespace -TEST(Raft, SpectralMatrices) { +TEST(Raft, SpectralMatrices) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -49,19 +50,18 @@ TEST(Raft, SpectralMatrices) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; sparse_matrix_t sm2{h, csr_v}; ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); - auto stream = h.get_stream(); + auto stream = h.get_stream(); auto t_exe_pol = thrust::cuda::par.on(stream); auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args @@ -71,8 +71,7 @@ TEST(Raft, SpectralMatrices) { EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - modularity_matrix_t mm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + modularity_matrix_t mm1{h, t_exe_pol, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 4a3b0ed196..8eb2f91952 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,14 +35,16 @@ struct MeanInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MeanInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) +{ return os; } template class MeanTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -59,13 +61,15 @@ class MeanTest : public ::testing::TestWithParam> { meanSGtest(data, stream); } - void meanSGtest(T *data, cudaStream_t stream) { + void meanSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(mean_act)); } @@ -78,52 +82,52 @@ class MeanTest : public ::testing::TestWithParam> { // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) -const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; - -const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, - {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, - {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, - {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, - {0.15, -1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, - {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, - {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, - {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, - {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; +const std::vector> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; + +const std::vector> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; typedef MeanTest MeanTestF; -TEST_P(MeanTestF, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(params.mean, mean_act, params.cols, CompareApprox(params.tolerance))); } typedef MeanTest MeanTestD; -TEST_P(MeanTestD, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(params.mean, mean_act, params.cols, CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 8b0d607561..67df0def05 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -34,16 +34,16 @@ struct MeanCenterInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MeanCenterInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs& dims) +{ return os; } template -class MeanCenterTest - : public ::testing::TestWithParam> { +class MeanCenterTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -51,7 +51,7 @@ class MeanCenterTest CUDA_CHECK(cudaStreamCreate(&stream)); auto rows = params.rows, cols = params.cols; - auto len = rows * cols; + auto len = rows * cols; IdxType vecLen = params.bcastAlongRows ? cols : rows; raft::allocate(out, len); @@ -59,16 +59,15 @@ class MeanCenterTest raft::allocate(data, len); raft::allocate(meanVec, vecLen); r.normal(data, len, params.mean, (T)1.0, stream); - raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, - stream); - meanCenter(out, data, meanVec, cols, rows, params.rowMajor, - params.bcastAlongRows, stream); - raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, - params.rowMajor, params.bcastAlongRows, (T)-1.0); + raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, stream); + meanCenter(out, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, stream); + raft::linalg::naiveMatVec( + out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(out)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(data)); @@ -106,12 +105,11 @@ const std::vector> inputsf_i32 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i32; -TEST_P(MeanCenterTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, @@ -139,12 +137,11 @@ const std::vector> inputsf_i64 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i64; -TEST_P(MeanCenterTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -172,12 +169,12 @@ const std::vector> inputsd_i32 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i32; -TEST_P(MeanCenterTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -205,12 +202,12 @@ const std::vector> inputsd_i64 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i64; -TEST_P(MeanCenterTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index ff2698788f..8b7f75171b 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -34,14 +34,16 @@ struct StdDevInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const StdDevInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const StdDevInputs& dims) +{ return os; } template class StdDevTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); random::Rng r(params.seed); int rows = params.rows, cols = params.cols; @@ -58,21 +60,21 @@ class StdDevTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void stdVarSGtest(T *data, cudaStream_t stream) { + void stdVarSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); - stddev(stddev_act, data, mean_act, cols, rows, params.sample, - params.rowMajor, stream); + stddev(stddev_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream); - vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, - stream); + vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream); raft::matrix::seqRoot(vars_act, T(1), cols, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(mean_act)); CUDA_CHECK(cudaFree(stddev_act)); @@ -121,28 +123,28 @@ const std::vector> inputsd = { {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}}; typedef StdDevTest StdDevTestF; -TEST_P(StdDevTestF, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(stddev_act, vars_act, params.cols, CompareApprox(params.tolerance))); } typedef StdDevTest StdDevTestD; -TEST_P(StdDevTestD, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(stddev_act, vars_act, params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index c3140d4588..89e81708cc 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -32,15 +32,17 @@ struct SumInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SumInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) +{ return os; } template class SumTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); int rows = params.rows, cols = params.cols; int len = rows * cols; cudaStream_t stream; @@ -59,7 +61,8 @@ class SumTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(sum_act)); } @@ -76,15 +79,17 @@ const std::vector> inputsd = {{0.05, 1024, 32, 1234ULL}, {0.05, 1024, 256, 1234ULL}}; typedef SumTest SumTestF; -TEST_P(SumTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(SumTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + float(params.rows), sum_act, params.cols, raft::CompareApprox(params.tolerance))); } typedef SumTest SumTestD; -TEST_P(SumTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(SumTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + double(params.rows), sum_act, params.cols, raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index b8e8fe3fa0..ca09d9c855 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -25,15 +25,16 @@ namespace raft { template struct Compare { - bool operator()(const T &a, const T &b) const { return a == b; } + bool operator()(const T& a, const T& b) const { return a == b; } }; template struct CompareApprox { CompareApprox(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(a - b); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(a - b); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); @@ -46,9 +47,10 @@ struct CompareApprox { template struct CompareApproxAbs { CompareApproxAbs(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(abs(a) - abs(b)); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(abs(a) - abs(b)); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); } @@ -58,25 +60,26 @@ struct CompareApproxAbs { }; template -T abs(const T &a) { +T abs(const T& a) +{ return a > T(0) ? a : -a; } /* - * @brief Helper function to compare 2 device n-D arrays with custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value(s) - * @param actual actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - * @{ - */ + * @brief Helper function to compare 2 device n-D arrays with custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value(s) + * @param actual actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + * @{ + */ template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); raft::update_host(exp_h.get(), expected, size, stream); @@ -86,16 +89,16 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto exp = exp_h.get()[i]; auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { - return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i; + return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i; } } return testing::AssertionSuccess(); } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, - L eq_compare, cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -110,9 +113,13 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, } template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t rows, size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch(const T* expected, + const T* actual, + size_t rows, + size_t cols, + L eq_compare, + cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); @@ -126,8 +133,7 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto act = act_h.get()[idx]; if (!eq_compare(exp, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i << "," - << j; + << "actual=" << act << " != expected=" << exp << " @" << i << "," << j; } } } @@ -135,9 +141,9 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -148,8 +154,7 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -157,24 +162,24 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, } /* - * @brief Helper function to compare a device n-D arrays with an expected array - * on the host, using a custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected_h host array of expected value(s) - * @param actual_d device array actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare a device n-D arrays with an expected array + * on the host, using a custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected_h host array of expected value(s) + * @param actual_d device array actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatchHost( + const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual_d, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - bool ok = true; + bool ok = true; auto fail = testing::AssertionFailure(); for (size_t i(0); i < size; ++i) { auto exp = expected_h[i]; @@ -189,19 +194,19 @@ testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, } /* - * @brief Helper function to compare diagonal values of a 2D matrix - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value along diagonal - * @param actual actual matrix - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare diagonal values of a 2D matrix + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value along diagonal + * @param actual actual matrix + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult diagonalMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -213,8 +218,7 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -222,10 +226,10 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, } template -testing::AssertionResult match(const T expected, T actual, L eq_compare) { +testing::AssertionResult match(const T expected, T actual, L eq_compare) +{ if (!eq_compare(expected, actual)) { - return testing::AssertionFailure() - << "actual=" << actual << " != expected=" << expected; + return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected; } return testing::AssertionSuccess(); }