diff --git a/cpp/.clang-format b/cpp/.clang-format index 779ca0033a..0c05436e92 100644 --- a/cpp/.clang-format +++ b/cpp/.clang-format @@ -1,72 +1,78 @@ --- # Refer to the following link for the explanation of each params: -# http://releases.llvm.org/8.0.1/tools/clang/docs/ClangFormatStyleOptions.html -Language: Cpp -# BasedOnStyle: Google +# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html +Language: Cpp +# BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false +AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true AlignEscapedNewlines: Left -AlignOperands: true +AlignOperands: true AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false # This is deprecated AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: true -BinPackParameters: true +BinPackArguments: false +BinPackParameters: false BraceWrapping: - AfterClass: false + AfterClass: false AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false # disabling the below splits, else, they'll just add to the vertical length of source files! SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: false BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach +BreakBeforeBraces: WebKit BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false +BreakInheritanceList: BeforeColon BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform ConstructorInitializerIndentWidth: 2 ContinuationIndentWidth: 2 Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false +DerivePointerAlignment: false +DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true -ForEachMacros: +ForEachMacros: - foreach - Q_FOREACH - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeCategories: +IncludeBlocks: Preserve +IncludeCategories: - Regex: '^' Priority: 2 - Regex: '^<.*\.h>' @@ -100,9 +106,9 @@ PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left -RawStringFormats: - - Language: Cpp - Delimiters: +RawStringFormats: + - Language: Cpp + Delimiters: - cc - CC - cpp @@ -111,7 +117,7 @@ RawStringFormats: - 'c++' - 'C++' CanonicalDelimiter: '' - - Language: TextProto + - Language: TextProto Delimiters: - pb - PB @@ -126,10 +132,10 @@ RawStringFormats: - ParseTextOrDie - ParseTextProtoOrDie CanonicalDelimiter: '' - BasedOnStyle: google + BasedOnStyle: google # Enabling comment reflow causes doxygen comments to be messed up in their formats! -ReflowComments: false -SortIncludes: true +ReflowComments: true +SortIncludes: true SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true @@ -139,19 +145,20 @@ SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 -SpacesInAngles: false +SpacesInAngles: false +SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -# We are C++14, but clang-format puts this under `Cpp11` itself -Standard: Cpp11 -StatementMacros: +Standard: c++17 +StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION # Be consistent with indent-width, even for people who use tab for indentation! -TabWidth: 2 -UseTab: Never -... +TabWidth: 2 +UseTab: Never diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index f380d276b2..08f836d3a8 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -21,7 +21,8 @@ namespace raft { /* Function for testing RAFT include * * @return message indicating RAFT has been included succesfully*/ -inline std::string test_raft() { +inline std::string test_raft() +{ std::string status = "RAFT Setup succesfully"; return status; } diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index a65227c402..dc9327bb94 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -42,17 +42,16 @@ namespace cache { * @param [out] out vectors collected from the cache, size [n_vec * n] */ template -__global__ void get_vecs(const math_t *cache, int_t n_vec, - const idx_t *cache_idx, int_t n, math_t *out) { +__global__ void get_vecs( + const math_t* cache, int_t n_vec, const idx_t* cache_idx, int_t n, math_t* out) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - size_t out_col = tid / n_vec; // col idx + size_t out_col = tid / n_vec; // col idx size_t cache_col = cache_idx[out_col]; if (cache_idx[out_col] >= 0) { - if (row + out_col * n_vec < (size_t)n_vec * n) { - out[tid] = cache[row + cache_col * n_vec]; - } + if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; } } } } @@ -84,21 +83,26 @@ __global__ void get_vecs(const math_t *cache, int_t n_vec, * @param [in] n_cache_vecs */ template -__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec, - const int *tile_idx, int n, const int *cache_idx, - math_t *cache, int n_cache_vecs) { +__global__ void store_vecs(const math_t* tile, + int n_tile, + int n_vec, + const int* tile_idx, + int n, + const int* cache_idx, + math_t* cache, + int n_cache_vecs) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - int tile_col = tid / n_vec; // col idx - int data_col = tile_idx ? tile_idx[tile_col] : tile_col; + int tile_col = tid / n_vec; // col idx + int data_col = tile_idx ? tile_idx[tile_col] : tile_col; int cache_col = cache_idx[tile_col]; // We ignore negative values. The rest of the checks should be fulfilled // if the cache is used properly if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) { - cache[row + (size_t)cache_col * n_vec] = - tile[row + (size_t)data_col * n_vec]; + cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec]; } } } @@ -121,14 +125,15 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; } * @return the index of the first element in the array for which * array[idx] >= value. If there is no such value, then return n. */ -int DI arg_first_ge(const int *array, int n, int val) { +int DI arg_first_ge(const int* array, int n, int val) +{ int start = 0; - int end = n - 1; + int end = n - 1; if (array[0] == val) return 0; if (array[end] < val) return n; while (start + 1 < end) { int q = (start + end + 1) / 2; - //invariants: + // invariants: // start < end // start < q <=end // array[start] < val && array[end] <=val @@ -157,7 +162,8 @@ int DI arg_first_ge(const int *array, int n, int val) { * @return the idx of the k-th occurance of val in array, or -1 if * the value is not found. */ -int DI find_nth_occurrence(const int *array, int n, int val, int k) { +int DI find_nth_occurrence(const int* array, int n, int val, int k) +{ int q = arg_first_ge(array, n, val); if (q + k < n && array[q + k] == val) { q += k; @@ -196,10 +202,10 @@ int DI find_nth_occurrence(const int *array, int n, int val, int k) { * Each block should give a different pointer for rank. */ template -DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { +DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) +{ const int items_per_thread = raft::ceildiv(associativity, nthreads); - typedef cub::BlockRadixSort - BlockRadixSort; + typedef cub::BlockRadixSort BlockRadixSort; __shared__ typename BlockRadixSort::TempStorage temp_storage; int key[items_per_thread]; @@ -208,8 +214,8 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { int block_offset = blockIdx.x * associativity; for (int j = 0; j < items_per_thread; j++) { - int k = threadIdx.x + j * nthreads; - int t = (k < associativity) ? cache_time[block_offset + k] : 32768; + int k = threadIdx.x + j * nthreads; + int t = (k < associativity) ? cache_time[block_offset + k] : 32768; key[j] = t; val[j] = k; } @@ -217,9 +223,7 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { BlockRadixSort(temp_storage).Sort(key, val); for (int j = 0; j < items_per_thread; j++) { - if (val[j] < associativity) { - rank[val[j]] = threadIdx.x * items_per_thread + j; - } + if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; } } __syncthreads(); } @@ -252,9 +256,15 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { * not be cached, size [n] */ template -__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, - int *cached_keys, int n_cache_sets, - int *cache_time, int time, int *cache_idx) { +__global__ void assign_cache_idx(const int* keys, + int n, + const int* cache_set, + int* cached_keys, + int n_cache_sets, + int* cache_time, + int time, + int* cache_idx) +{ int block_offset = blockIdx.x * associativity; const int items_per_thread = raft::ceildiv(associativity, nthreads); @@ -273,7 +283,7 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, // these elements are assigned -1. for (int j = 0; j < items_per_thread; j++) { - int i = threadIdx.x + j * nthreads; + int i = threadIdx.x + j * nthreads; int t_idx = block_offset + i; bool mask = (i < associativity); // whether this slot is available for writing @@ -284,10 +294,10 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, if (mask) { int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]); if (k > -1) { - int key_val = keys[k]; + int key_val = keys[k]; cached_keys[t_idx] = key_val; - cache_idx[k] = t_idx; - cache_time[t_idx] = time; + cache_idx[k] = t_idx; + cache_time[t_idx] = time; } } } @@ -315,21 +325,28 @@ namespace { * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity] * @param [in] n_cache_sets number of cache sets * @param [in] associativity number of keys in cache set - * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity] + * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * + * associativity] * @param [out] cache_idx cache indices of the working set elements, size [n] * @param [out] is_cached whether the element is cached size[n] * @param [in] time iteration counter (used for time stamping) */ -__global__ void get_cache_idx(int *keys, int n, int *cached_keys, - int n_cache_sets, int associativity, - int *cache_time, int *cache_idx, bool *is_cached, - int time) { +__global__ void get_cache_idx(int* keys, + int n, + int* cached_keys, + int n_cache_sets, + int associativity, + int* cache_time, + int* cache_idx, + bool* is_cached, + int time) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) { - int widx = keys[tid]; - int sidx = hash(widx, n_cache_sets); - int cidx = sidx * associativity; - int i = 0; + int widx = keys[tid]; + int sidx = hash(widx, n_cache_sets); + int cidx = sidx * associativity; + int i = 0; bool found = false; // search for empty spot and the least recently used spot while (i < associativity && !found) { @@ -338,9 +355,9 @@ __global__ void get_cache_idx(int *keys, int n, int *cached_keys, } is_cached[tid] = found; if (found) { - cidx = cidx + i - 1; - cache_time[cidx] = time; //update time stamp - cache_idx[tid] = cidx; //exact cache idx + cidx = cidx + i - 1; + cache_time[cidx] = time; // update time stamp + cache_idx[tid] = cidx; // exact cache idx } else { cache_idx[tid] = sidx; // assign cache set } diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 8e3519fea5..32a46968b6 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -22,28 +22,32 @@ namespace raft { /** - * @brief Convenience wrapper over cub's SortPairs method - * @tparam KeyT key type - * @tparam ValueT value type - * @param workspace workspace buffer which will get resized if not enough space - * @param inKeys input keys array - * @param outKeys output keys array - * @param inVals input values array - * @param outVals output values array - * @param len array length - * @param stream cuda stream - */ + * @brief Convenience wrapper over cub's SortPairs method + * @tparam KeyT key type + * @tparam ValueT value type + * @param workspace workspace buffer which will get resized if not enough space + * @param inKeys input keys array + * @param outKeys output keys array + * @param inVals input values array + * @param outVals output values array + * @param len array length + * @param stream cuda stream + */ template -void sortPairs(rmm::device_uvector &workspace, const KeyT *inKeys, - KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, - cudaStream_t stream) { +void sortPairs(rmm::device_uvector& workspace, + const KeyT* inKeys, + KeyT* outKeys, + const ValueT* inVals, + ValueT* outVals, + int len, + cudaStream_t stream) +{ size_t worksize; - cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals, - outVals, len, 0, sizeof(KeyT) * 8, stream); + cub::DeviceRadixSort::SortPairs( + nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); workspace.resize(worksize, stream); - cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys, - inVals, outVals, len, 0, sizeof(KeyT) * 8, - stream); + cub::DeviceRadixSort::SortPairs( + workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); } } // namespace raft diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index bb2b019ecb..41dc9cab08 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -31,40 +31,43 @@ namespace raft { * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ -DI void sts(float* addr, const float& x) { +DI void sts(float* addr, const float& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x)); } -DI void sts(float* addr, const float (&x)[1]) { +DI void sts(float* addr, const float (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0])); } -DI void sts(float* addr, const float (&x)[2]) { +DI void sts(float* addr, const float (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f32 [%0], {%1, %2};" - : - : "l"(s2), "f"(x[0]), "f"(x[1])); + asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1])); } -DI void sts(float* addr, const float (&x)[4]) { +DI void sts(float* addr, const float (&x)[4]) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};" : : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3])); } -DI void sts(double* addr, const double& x) { +DI void sts(double* addr, const double& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x)); } -DI void sts(double* addr, const double (&x)[1]) { +DI void sts(double* addr, const double (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0])); } -DI void sts(double* addr, const double (&x)[2]) { +DI void sts(double* addr, const double (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f64 [%0], {%1, %2};" - : - : "l"(s2), "d"(x[0]), "d"(x[1])); + asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1])); } /** @} */ @@ -80,39 +83,42 @@ DI void sts(double* addr, const double (&x)[2]) { * @param[in] addr shared memory address from where to load * (should be aligned to vector size) */ -DI void lds(float& x, float* addr) { +DI void lds(float& x, float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); } -DI void lds(float (&x)[1], float* addr) { +DI void lds(float (&x)[1], float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); } -DI void lds(float (&x)[2], float* addr) { +DI void lds(float (&x)[2], float* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); } -DI void lds(float (&x)[4], float* addr) { +DI void lds(float (&x)[4], float* addr) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(s4)); } -DI void lds(double& x, double* addr) { +DI void lds(double& x, double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1)); } -DI void lds(double (&x)[1], double* addr) { +DI void lds(double (&x)[1], double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1)); } -DI void lds(double (&x)[2], double* addr) { +DI void lds(double (&x)[2], double* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2)); } /** @} */ @@ -123,32 +129,35 @@ DI void lds(double (&x)[2], double* addr) { * @param[out] x data to be loaded from global memory * @param[in] addr address in global memory from where to load */ -DI void ldg(float& x, const float* addr) { +DI void ldg(float& x, const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr)); } -DI void ldg(float (&x)[1], const float* addr) { +DI void ldg(float (&x)[1], const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr)); } -DI void ldg(float (&x)[2], const float* addr) { - asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(addr)); +DI void ldg(float (&x)[2], const float* addr) +{ + asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr)); } -DI void ldg(float (&x)[4], const float* addr) { +DI void ldg(float (&x)[4], const float* addr) +{ asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(addr)); } -DI void ldg(double& x, const double* addr) { +DI void ldg(double& x, const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr)); } -DI void ldg(double (&x)[1], const double* addr) { +DI void ldg(double (&x)[1], const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr)); } -DI void ldg(double (&x)[2], const double* addr) { - asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(addr)); +DI void ldg(double (&x)[2], const double* addr) +{ + asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); } /** @} */ diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 785794461e..b228ac5499 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -22,8 +22,8 @@ namespace raft { template -__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, - IdxT len, Lambda op) { +__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +{ typedef TxN_t DataVec; typedef TxN_t IdxVec; IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); @@ -34,61 +34,60 @@ __global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, DataVec dataIn; #pragma unroll for (int i = 0; i < VecLen; ++i) { - auto inPos = idxIn.val.data[i]; + auto inPos = idxIn.val.data[i]; dataIn.val.data[i] = op(in[inPos], tid + i); } dataIn.store(out, tid); } template -void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - Lambda op, cudaStream_t stream) { +void scatterImpl( + DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream) +{ const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); - scatterKernel - <<>>(out, in, idx, len, op); + scatterKernel<<>>(out, in, idx, len, op); CUDA_CHECK(cudaGetLastError()); } /** - * @brief Performs scatter operation based on the input indexing array - * @tparam DataT data type whose array gets scattered - * @tparam IdxT indexing type - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Lambda the device-lambda performing a unary operation on the loaded - * data before it gets scattered - * @param out the output array - * @param in the input array - * @param idx the indexing array - * @param len number of elements in the input array - * @param stream cuda stream where to launch work - * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This - * will be applied to every element before scattering it to the right location. - * The second param in this method will be the destination index. - */ -template , int TPB = 256> -void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - cudaStream_t stream, Lambda op = raft::Nop()) { + * @brief Performs scatter operation based on the input indexing array + * @tparam DataT data type whose array gets scattered + * @tparam IdxT indexing type + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Lambda the device-lambda performing a unary operation on the loaded + * data before it gets scattered + * @param out the output array + * @param in the input array + * @param idx the indexing array + * @param len number of elements in the input array + * @param stream cuda stream where to launch work + * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This + * will be applied to every element before scattering it to the right location. + * The second param in this method will be the destination index. + */ +template , int TPB = 256> +void scatter(DataT* out, + const DataT* in, + const IdxT* idx, + IdxT len, + cudaStream_t stream, + Lambda op = raft::Nop()) +{ if (len <= 0) return; - constexpr size_t DataSize = sizeof(DataT); - constexpr size_t IdxSize = sizeof(IdxT); + constexpr size_t DataSize = sizeof(DataT); + constexpr size_t IdxSize = sizeof(IdxT); constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; - size_t bytes = len * MaxPerElem; + size_t bytes = len * MaxPerElem; if (16 / MaxPerElem && bytes % 16 == 0) { - scatterImpl(out, in, idx, len, - op, stream); + scatterImpl(out, in, idx, len, op, stream); } else if (8 / MaxPerElem && bytes % 8 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (4 / MaxPerElem && bytes % 4 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (2 / MaxPerElem && bytes % 2 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (1 / MaxPerElem) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else { scatterImpl(out, in, idx, len, op, stream); } diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index bd8a4ce9e7..68b8e723e9 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,16 +25,7 @@ namespace raft { namespace comms { typedef unsigned int request_t; -enum class datatype_t { - CHAR, - UINT8, - INT32, - UINT32, - INT64, - UINT64, - FLOAT32, - FLOAT64 -}; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; enum class op_t { SUM, PROD, MIN, MAX }; /** @@ -50,42 +41,50 @@ template constexpr datatype_t get_type(); template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::CHAR; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT8; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT64; } @@ -95,76 +94,106 @@ class comms_iface { virtual int get_rank() const = 0; virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual void barrier() const = 0; + virtual void barrier() const = 0; virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; - virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void bcast(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, int root, + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, cudaStream_t stream) const = 0; - virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, void* recvbuf, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, cudaStream_t stream) const = 0; + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; - virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void reducescatter(const void* sendbuff, void* recvbuff, - size_t recvcount, datatype_t datatype, op_t op, + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const = 0; + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, cudaStream_t stream) const = 0; - virtual void device_multicast_sendrecv( - const void* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - void* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const = 0; + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; }; class comms_t { public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) { + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } @@ -191,7 +220,8 @@ class comms_t { * @param color ranks w/ the same color are placed in the same communicator * @param key controls rank assignment */ - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { return impl_->comm_split(color, key); } @@ -208,9 +238,7 @@ class comms_t { * * @param stream the cuda stream to sync collective operations on */ - status_t sync_stream(cudaStream_t stream) const { - return impl_->sync_stream(stream); - } + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } /** * Performs an asynchronous point-to-point send @@ -223,10 +251,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void isend(const value_t* buf, size_t size, int dest, int tag, - request_t* request) const { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, - tag, request); + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); } /** @@ -240,10 +267,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void irecv(value_t* buf, size_t size, int source, int tag, - request_t* request) const { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, - request); + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); } /** @@ -251,7 +277,8 @@ class comms_t { * @param count number of requests to synchronize on * @param array_of_requests an array of request_t objects returned from isend/irecv */ - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { impl_->waitall(count, array_of_requests); } @@ -265,11 +292,15 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, - op_t op, cudaStream_t stream) const { + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), - op, stream); + static_cast(recvbuff), + count, + get_type(), + op, + stream); } /** @@ -281,9 +312,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { - impl_->bcast(static_cast(buff), count, get_type(), root, - stream); + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); } /** @@ -296,10 +327,14 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void bcast(const value_t* sendbuff, value_t* recvbuff, size_t count, int root, - cudaStream_t stream) const { + void bcast( + const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), root, + static_cast(recvbuff), + count, + get_type(), + root, stream); } @@ -314,11 +349,20 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, - int root, cudaStream_t stream) const { + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), op, - root, stream); + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); } /** @@ -330,11 +374,16 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - cudaStream_t stream) const { + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, - get_type(), stream); + static_cast(recvbuff), + sendcount, + get_type(), + stream); } /** @@ -349,12 +398,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgatherv(const value_t* sendbuf, value_t* recvbuf, - const size_t* recvcounts, const size_t* displs, - cudaStream_t stream) const { + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), recvcounts, displs, - get_type(), stream); + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); } /** @@ -367,11 +422,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - int root, cudaStream_t stream) const { + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, get_type(), - root, stream); + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); } /** @@ -388,12 +450,22 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, int root, - cudaStream_t stream) const { + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), sendcount, recvcounts, displs, - get_type(), root, stream); + static_cast(recvbuf), + sendcount, + recvcounts, + displs, + get_type(), + root, + stream); } /** @@ -406,11 +478,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reducescatter(const value_t* sendbuff, value_t* recvbuff, - size_t recvcount, op_t op, cudaStream_t stream) const { + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), recvcount, - get_type(), op, stream); + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); } /** @@ -425,10 +504,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_send(const value_t* buf, size_t size, int dest, - cudaStream_t stream) const { - impl_->device_send(static_cast(buf), size * sizeof(value_t), - dest, stream); + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); } /** @@ -443,10 +521,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_recv(value_t* buf, size_t size, int source, - cudaStream_t stream) const { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, - stream); + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); } /** @@ -462,12 +539,21 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest, - value_t* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { - impl_->device_sendrecv( - static_cast(sendbuf), sendsize * sizeof(value_t), dest, - static_cast(recvbuf), recvsize * sizeof(value_t), source, stream); + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); } /** @@ -485,28 +571,37 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_multicast_sendrecv( - const value_t* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - value_t* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { - auto sendbytesizes = sendsizes; + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; auto sendbyteoffsets = sendoffsets; for (size_t i = 0; i < sendsizes.size(); ++i) { sendbytesizes[i] *= sizeof(value_t); sendbyteoffsets[i] *= sizeof(value_t); } - auto recvbytesizes = recvsizes; + auto recvbytesizes = recvsizes; auto recvbyteoffsets = recvoffsets; for (size_t i = 0; i < recvsizes.size(); ++i) { recvbytesizes[i] *= sizeof(value_t); recvbyteoffsets[i] *= sizeof(value_t); } impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, sendbyteoffsets, dests, - static_cast(recvbuf), recvbytesizes, - recvbyteoffsets, sources, stream); + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); } private: diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index e01490d728..2be5b0d23f 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -36,12 +36,12 @@ namespace comms { * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, - int num_ranks, int rank) { +void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank) +{ cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); + auto communicator = std::make_shared( + std::unique_ptr(new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); handle->set_comms(communicator); } @@ -60,20 +60,20 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, - void *ucp_worker, void *eps, int num_ranks, - int rank) { - auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); +void build_comms_nccl_ucx( + handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank) +{ + auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); - auto size_t_ep_arr = reinterpret_cast(eps); + auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < num_ranks; i++) { - size_t ptr = size_t_ep_arr[i]; - auto ucp_ep_v = reinterpret_cast(*eps_sp); + size_t ptr = size_t_ep_arr[i]; + auto ucp_ep_v = reinterpret_cast(*eps_sp); if (ptr != 0) { auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); - ucp_ep_v[i] = eps_ptr; + ucp_ep_v[i] = eps_ptr; } else { ucp_ep_v[i] = nullptr; } @@ -81,18 +81,19 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared( - std::unique_ptr(new raft::comms::std_comms( + auto communicator = + std::make_shared(std::unique_ptr(new raft::comms::std_comms( nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); handle->set_comms(communicator); } -inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, - int size) { +inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size) +{ memcpy(id->internal, uniqueId, size); } -inline void get_unique_id(char *uid, int size) { +inline void get_unique_id(char* uid, int size) +{ ncclUniqueId id; ncclGetUniqueId(&id); diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 067c7bd0ab..3091cd53a9 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -32,16 +32,16 @@ #include #include -#define MPI_TRY(call) \ - do { \ - int status = call; \ - if (MPI_SUCCESS != status) { \ - int mpi_error_string_lenght = 0; \ - char mpi_error_string[MPI_MAX_ERROR_STRING]; \ - MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \ - #call, mpi_error_string); \ - } \ +#define MPI_TRY(call) \ + do { \ + int status = call; \ + if (MPI_SUCCESS != status) { \ + int mpi_error_string_lenght = 0; \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ + RAFT_EXPECTS( \ + MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \ + } \ } while (0) #define MPI_TRY_NO_THROW(call) \ @@ -51,48 +51,41 @@ int mpi_error_string_lenght = 0; \ char mpi_error_string[MPI_MAX_ERROR_STRING]; \ MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - printf("MPI call='%s' at file=%s line=%d failed with %s ", #call, \ - __FILE__, __LINE__, mpi_error_string); \ + printf("MPI call='%s' at file=%s line=%d failed with %s ", \ + #call, \ + __FILE__, \ + __LINE__, \ + mpi_error_string); \ } \ } while (0) namespace raft { namespace comms { -constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) { +constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return MPI_CHAR; - case datatype_t::UINT8: - return MPI_UNSIGNED_CHAR; - case datatype_t::INT32: - return MPI_INT; - case datatype_t::UINT32: - return MPI_UNSIGNED; - case datatype_t::INT64: - return MPI_LONG_LONG; - case datatype_t::UINT64: - return MPI_UNSIGNED_LONG_LONG; - case datatype_t::FLOAT32: - return MPI_FLOAT; - case datatype_t::FLOAT64: - return MPI_DOUBLE; + case datatype_t::CHAR: return MPI_CHAR; + case datatype_t::UINT8: return MPI_UNSIGNED_CHAR; + case datatype_t::INT32: return MPI_INT; + case datatype_t::UINT32: return MPI_UNSIGNED; + case datatype_t::INT64: return MPI_LONG_LONG; + case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG; + case datatype_t::FLOAT32: return MPI_FLOAT; + case datatype_t::FLOAT64: return MPI_DOUBLE; default: // Execution should never reach here. This takes care of compiler warning. return MPI_DOUBLE; } } -constexpr MPI_Op get_mpi_op(const op_t op) { +constexpr MPI_Op get_mpi_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return MPI_SUM; - case op_t::PROD: - return MPI_PROD; - case op_t::MIN: - return MPI_MIN; - case op_t::MAX: - return MPI_MAX; + case op_t::SUM: return MPI_SUM; + case op_t::PROD: return MPI_PROD; + case op_t::MIN: return MPI_MIN; + case op_t::MAX: return MPI_MAX; default: // Execution should never reach here. This takes care of compiler warning. return MPI_MAX; @@ -102,38 +95,35 @@ constexpr MPI_Op get_mpi_op(const op_t op) { class mpi_comms : public comms_iface { public: mpi_comms(MPI_Comm comm, const bool owns_mpi_comm) - : owns_mpi_comm_(owns_mpi_comm), - mpi_comm_(comm), - size_(0), - rank_(1), - next_request_id_(0) { + : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0) + { int mpi_is_initialized = 0; MPI_TRY(MPI_Initialized(&mpi_is_initialized)); RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!"); MPI_TRY(MPI_Comm_size(mpi_comm_, &size_)); MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_)); - //get NCCL unique ID at rank 0 and broadcast it to all others + // get NCCL unique ID at rank 0 and broadcast it to all others ncclUniqueId id; if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id)); MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_)); - //initializing NCCL + // initializing NCCL NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_)); } - virtual ~mpi_comms() { - //finalizing NCCL + virtual ~mpi_comms() + { + // finalizing NCCL NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_)); - if (owns_mpi_comm_) { - MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); - } + if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); } } int get_size() const { return size_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { MPI_Comm new_comm; MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm)); return std::unique_ptr(new mpi_comms(new_comm, true)); @@ -141,15 +131,15 @@ class mpi_comms : public comms_iface { void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); } - void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const { + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req)); @@ -157,15 +147,15 @@ class mpi_comms : public comms_iface { *request = req_id; } - void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const { + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } @@ -174,7 +164,8 @@ class mpi_comms : public comms_iface { *request = req_id; } - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { std::vector requests; requests.reserve(count); for (int i = 0; i < count; ++i) { @@ -189,94 +180,149 @@ class mpi_comms : public comms_iface { MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); } - void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void bcast(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclBroadcast( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts, - const size_t* displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < size_; ++root) { - NCCL_TRY(ncclBroadcast(sendbuf, - static_cast(recvbuf) + - displs[root] * get_datatype_size(datatype), - recvcounts[root], get_nccl_datatype(datatype), - root, nccl_comm_, stream)); + NCCL_TRY( + ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * get_datatype_size(datatype), + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], get_nccl_datatype(datatype), r, - nccl_comm_, stream)); + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -309,45 +355,58 @@ class mpi_comms : public comms_iface { }; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } void device_multicast_sendrecv(const void* sendbuf, std::vector const& sendsizes, std::vector const& sendoffsets, - std::vector const& dests, void* recvbuf, + std::vector const& dests, + void* recvbuf, std::vector const& recvsizes, std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -365,9 +424,10 @@ class mpi_comms : public comms_iface { mutable std::unordered_set free_requests_; }; -inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) { - auto communicator = std::make_shared( - std::unique_ptr(new mpi_comms(comm, true))); +inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) +{ + auto communicator = + std::make_shared(std::unique_ptr(new mpi_comms(comm, true))); handle->set_comms(communicator); }; diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 47559b1718..1647c29667 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,9 +64,13 @@ class std_comms : public comms_iface { * @param stream cuda stream for synchronizing and ordering collective operations * @param subcomms_ucp use ucp for subcommunicators */ - std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int num_ranks, int rank, - cudaStream_t stream, bool subcomms_ucp = true) + std_comms(ncclComm_t nccl_comm, + ucp_worker_h ucp_worker, + std::shared_ptr eps, + int num_ranks, + int rank, + cudaStream_t stream, + bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), status_(2, stream), @@ -75,7 +79,8 @@ class std_comms : public comms_iface { subcomms_ucp_(subcomms_ucp), ucp_worker_(ucp_worker), ucp_eps_(eps), - next_request_id_(0) { + next_request_id_(0) + { initialize(); }; @@ -86,18 +91,19 @@ class std_comms : public comms_iface { * @param rank rank of the current worker * @param stream stream for ordering collective operations */ - std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - cudaStream_t stream) + std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, cudaStream_t stream) : nccl_comm_(nccl_comm), stream_(stream), status_(2, stream), num_ranks_(num_ranks), rank_(rank), - subcomms_ucp_(false) { + subcomms_ucp_(false) + { initialize(); }; - void initialize() { + void initialize() + { sendbuff_ = status_.data(); recvbuff_ = status_.data() + 1; } @@ -106,17 +112,16 @@ class std_comms : public comms_iface { int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { rmm::device_uvector d_colors(get_size(), stream_); rmm::device_uvector d_keys(get_size(), stream_); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); - allgather(d_colors.data() + get_rank(), d_colors.data(), 1, - datatype_t::INT32, stream_); - allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, - stream_); + allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_); + allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_); this->sync_stream(stream_); std::vector h_colors(get_size()); @@ -133,9 +138,7 @@ class std_comms : public comms_iface { for (int i = 0; i < get_size(); ++i) { if (h_colors[i] == color) { subcomm_ranks.push_back(i); - if (ucp_worker_ != nullptr && subcomms_ucp_) { - new_ucx_ptrs.push_back((*ucp_eps_)[i]); - } + if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); } } } @@ -144,8 +147,7 @@ class std_comms : public comms_iface { NCCL_TRY(ncclGetUniqueId(&id)); std::vector requests(subcomm_ranks.size() - 1); for (size_t i = 1; i < subcomm_ranks.size(); ++i) { - isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, - requests.data() + (i - 1)); + isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1)); } waitall(requests.size(), requests.data()); } else { @@ -160,17 +162,22 @@ class std_comms : public comms_iface { NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); if (ucp_worker_ != nullptr && subcomms_ucp_) { - auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr( - new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, - subcomm_ranks.size(), key, stream_, subcomms_ucp_)); + auto eps_sp = std::make_shared(new_ucx_ptrs.data()); + return std::unique_ptr(new std_comms(nccl_comm, + (ucp_worker_h)ucp_worker_, + eps_sp, + subcomm_ranks.size(), + key, + stream_, + subcomms_ucp_)); } else { return std::unique_ptr( new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_)); } } - void barrier() const { + void barrier() const + { CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); @@ -180,39 +187,37 @@ class std_comms : public comms_iface { "ERROR: syncStream failed. This can be caused by a failed rank_."); } - void get_request_id(request_t *req) const { + void get_request_id(request_t* req) const + { request_t req_id; if (this->free_requests_.empty()) req_id = this->next_request_id_++; else { auto it = this->free_requests_.begin(); - req_id = *it; + req_id = *it; this->free_requests_.erase(it); } *req = req_id; } - void isend(const void *buf, size_t size, int dest, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); ucp_ep_h ep_ptr = (*ucp_eps_)[dest]; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); - this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, - default_tag_mask, get_rank()); + this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank()); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void irecv(void *buf, size_t size, int source, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); @@ -220,18 +225,17 @@ class std_comms : public comms_iface { ucp_tag_t tag_mask = default_tag_mask; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, - tag_mask, source); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); + ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void waitall(int count, request_t array_of_requests[]) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void waitall(int count, request_t array_of_requests[]) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); - std::vector requests; + std::vector requests; requests.reserve(count); time_t start = time(NULL); @@ -239,7 +243,8 @@ class std_comms : public comms_iface { for (int i = 0; i < count; ++i) { auto req_it = requests_in_flight_.find(array_of_requests[i]); ASSERT(requests_in_flight_.end() != req_it, - "ERROR: waitall on invalid request: %d", array_of_requests[i]); + "ERROR: waitall on invalid request: %d", + array_of_requests[i]); requests.push_back(req_it->second); free_requests_.insert(req_it->first); requests_in_flight_.erase(req_it); @@ -252,8 +257,7 @@ class std_comms : public comms_iface { // in 10 or more seconds. ASSERT(now - start < 10, "Timed out waiting for requests."); - for (std::vector::iterator it = requests.begin(); - it != requests.end();) { + for (std::vector::iterator it = requests.begin(); it != requests.end();) { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue @@ -266,10 +270,8 @@ class std_comms : public comms_iface { // If the message needs release, we know it will be sent/received // asynchronously, so we will need to track and verify its state if (req->needs_release) { - ASSERT(UCS_PTR_IS_PTR(req->req), - "UCX Request Error. Request is not valid UCX pointer"); - ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", - UCS_PTR_STATUS(req->req)); + ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer"); + ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req)); ASSERT(req->req->completed == 1 || req->req->completed == 0, "request->completed not a valid value: %d\n", req->req->completed); @@ -290,101 +292,154 @@ class std_comms : public comms_iface { ++it; } // if any progress was made, reset the timeout start time - if (restart) { - start = time(NULL); - } + if (restart) { start = time(NULL); } } } } - void allreduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void *buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void bcast(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclBroadcast( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts, - const size_t *displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclBroadcast( - sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + NCCL_TRY(ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * dtype_size, + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount, - const size_t *recvcounts, const size_t *displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -417,45 +472,58 @@ class std_comms : public comms_iface { } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void *buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void *buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void *sendbuf, size_t sendsize, int dest, - void *recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void device_multicast_sendrecv(const void *sendbuf, - std::vector const &sendsizes, - std::vector const &sendoffsets, - std::vector const &dests, void *recvbuf, - std::vector const &recvsizes, - std::vector const &recvoffsets, - std::vector const &sources, - cudaStream_t stream) const { + void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { - NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { - NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -475,10 +543,9 @@ class std_comms : public comms_iface { comms_ucp_handler ucp_handler_; ucp_worker_h ucp_worker_; - std::shared_ptr ucp_eps_; + std::shared_ptr ucp_eps_; mutable request_t next_request_id_; - mutable std::unordered_map - requests_in_flight_; + mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; }; } // end namespace comms diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 39086de25d..5f87bf41fa 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -35,24 +35,23 @@ namespace comms { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_allreduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allreduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = 1; cudaStream_t stream = handle.get_stream(); rmm::device_scalar temp_d(stream); - CUDA_CHECK( - cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK( - cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -67,10 +66,11 @@ bool test_collective_allreduce(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_broadcast(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_broadcast(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -79,14 +79,12 @@ bool test_collective_broadcast(const handle_t &handle, int root) { rmm::device_scalar temp_d(stream); if (communicator.get_rank() == root) - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -101,10 +99,11 @@ bool test_collective_broadcast(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_reduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -112,14 +111,12 @@ bool test_collective_reduce(const handle_t &handle, int root) { rmm::device_scalar temp_d(stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -137,10 +134,11 @@ bool test_collective_reduce(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_allgather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allgather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -149,16 +147,13 @@ bool test_collective_allgather(const handle_t &handle, int root) { rmm::device_scalar temp_d(stream); rmm::device_uvector recv_d(communicator.get_size(), stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); communicator.sync_stream(stream); - int - temp_h[communicator.get_size()]; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), - sizeof(int) * communicator.get_size(), - cudaMemcpyDeviceToHost, stream)); + int temp_h[communicator.get_size()]; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync( + &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -176,30 +171,29 @@ bool test_collective_allgather(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_gather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); cudaStream_t stream = handle.get_stream(); rmm::device_scalar temp_d(stream); - rmm::device_uvector recv_d( - communicator.get_rank() == root ? communicator.get_size() : 0, stream); + rmm::device_uvector recv_d(communicator.get_rank() == root ? communicator.get_size() : 0, + stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(communicator.get_size(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), - sizeof(int) * temp_h.size(), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -214,45 +208,47 @@ bool test_collective_gather(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_gatherv(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gatherv(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sendcounts(communicator.get_size()); std::iota(sendcounts.begin(), sendcounts.end(), size_t{1}); std::vector displacements(communicator.get_size() + 1, 0); - std::partial_sum(sendcounts.begin(), sendcounts.end(), - displacements.begin() + 1); + std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1); - std::vector sends(displacements[communicator.get_rank() + 1] - - displacements[communicator.get_rank()], - communicator.get_rank()); + std::vector sends( + displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()], + communicator.get_rank()); cudaStream_t stream = handle.get_stream(); rmm::device_uvector temp_d(sends.size(), stream); - rmm::device_uvector recv_d( - communicator.get_rank() == root ? displacements.back() : 0, stream); + rmm::device_uvector recv_d(communicator.get_rank() == root ? displacements.back() : 0, + stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gatherv( - temp_d.data(), recv_d.data(), temp_d.size(), - communicator.get_rank() == root ? sendcounts.data() - : static_cast(nullptr), - communicator.get_rank() == root ? displacements.data() - : static_cast(nullptr), - root, stream); + temp_d.data(), + recv_d.data(), + temp_d.size(), + communicator.get_rank() == root ? sendcounts.data() : static_cast(nullptr), + communicator.get_rank() == root ? displacements.data() : static_cast(nullptr), + root, + stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(displacements.back(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), + recv_d.data(), sizeof(int) * displacements.back(), - cudaMemcpyDeviceToHost, stream)); + cudaMemcpyDeviceToHost, + stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -271,10 +267,11 @@ bool test_collective_gatherv(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_reducescatter(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reducescatter(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sends(communicator.get_size(), 1); @@ -283,16 +280,13 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { rmm::device_uvector temp_d(sends.size(), stream); rmm::device_scalar recv_d(stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, - stream); + communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -309,9 +303,10 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { * initialized comms instance. * @param[in] numTrials number of iterations of all-to-all messaging to perform */ -bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); +bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -320,11 +315,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { std::vector requests; requests.resize(2 * (communicator.get_size() - 1)); int request_idx = 0; - //post receives + // post receives for (int r = 0; r < communicator.get_size(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, r, 0, - requests.data() + request_idx); + communicator.irecv( + received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx); ++request_idx; } } @@ -360,8 +355,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { communicator.barrier(); } - if (communicator.get_rank() == 0) - std::cout << "=========================" << std::endl; + if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl; } return ret; @@ -374,10 +368,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -400,13 +395,9 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { communicator.sync_stream(stream); - if (!sender && received_data.value(stream) != rank - 1) { - ret = false; - } + if (!sender && received_data.value(stream) != rank - 1) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -419,10 +410,11 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -436,12 +428,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { if (rank % 2 == 0) { if (rank + 1 < communicator.get_size()) { - communicator.device_sendrecv(sent_data.data(), 1, rank + 1, - received_data.data(), 1, rank + 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream); } } else { - communicator.device_sendrecv(sent_data.data(), 1, rank - 1, - received_data.data(), 1, rank - 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream); } communicator.sync_stream(stream); @@ -451,9 +443,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -466,11 +456,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, - int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -493,25 +483,26 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, std::vector srcs(communicator.get_size()); std::iota(srcs.begin(), srcs.end(), int{0}); - communicator.device_multicast_sendrecv( - sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(), - recvsizes, recvoffsets, srcs, stream); + communicator.device_multicast_sendrecv(sent_data.data(), + sendsizes, + sendoffsets, + dests, + received_data.data(), + recvsizes, + recvoffsets, + srcs, + stream); communicator.sync_stream(stream); std::vector h_received_data(communicator.get_size()); - raft::update_host(h_received_data.data(), received_data.data(), - received_data.size(), stream); + raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream); CUDA_TRY(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); ++i) { - if (h_received_data[i] != i) { - ret = false; - } + if (h_received_data[i] != i) { ret = false; } } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -524,20 +515,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, * initialized comms instance. * @param n_colors number of different colors to test */ -bool test_commsplit(const handle_t &h, int n_colors) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - int const size = communicator.get_size(); +bool test_commsplit(const handle_t& h, int n_colors) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + int const size = communicator.get_size(); if (n_colors > size) n_colors = size; // first we need to assign to a color, then assign the rank within the color int color = rank % n_colors; - int key = rank / n_colors; + int key = rank / n_colors; handle_t new_handle(1); - auto shared_comm = - std::make_shared(communicator.comm_split(color, key)); + auto shared_comm = std::make_shared(communicator.comm_split(color, key)); new_handle.set_comms(shared_comm); return test_collective_allreduce(new_handle, 0); diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 226b6f0527..89c7b25630 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -25,16 +25,19 @@ namespace raft { namespace comms { -typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); -typedef void (*dlsym_rec_free)(void *); +typedef void (*dlsym_print_info)(ucp_ep_h, FILE*); +typedef void (*dlsym_rec_free)(void*); typedef int (*dlsym_worker_progress)(ucp_worker_h); -typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t, - ucp_datatype_t, ucp_tag_t, - ucp_send_callback_t); -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count, - ucp_datatype_t datatype, ucp_tag_t, - ucp_tag_t, ucp_tag_recv_callback_t); +typedef ucs_status_ptr_t (*dlsym_send)( + ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t); +typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, + void*, + size_t count, + ucp_datatype_t datatype, + ucp_tag_t, + ucp_tag_t, + ucp_tag_recv_callback_t); /** * Standard UCX request object that will be passed @@ -55,9 +58,9 @@ struct ucx_context { */ class ucp_request { public: - struct ucx_context *req; - bool needs_release = true; - int other_rank = -1; + struct ucx_context* req; + bool needs_release = true; + int other_rank = -1; bool is_send_request = false; }; @@ -67,18 +70,19 @@ static const ucp_tag_t default_tag_mask = -1; /** * @brief Asynchronous send callback sets request to completed */ -static void send_callback(void *request, ucs_status_t status) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void send_callback(void* request, ucs_status_t status) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** * @brief Asynchronous recv callback sets request to completed */ -static void recv_callback(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** @@ -87,7 +91,8 @@ static void recv_callback(void *request, ucs_status_t status, */ class comms_ucp_handler { public: - comms_ucp_handler() { + comms_ucp_handler() + { load_ucp_handle(); load_send_func(); load_recv_func(); @@ -99,7 +104,7 @@ class comms_ucp_handler { ~comms_ucp_handler() { dlclose(ucp_handle); } private: - void *ucp_handle; + void* ucp_handle; dlsym_print_info print_info_func; dlsym_rec_free req_free_func; @@ -107,7 +112,8 @@ class comms_ucp_handler { dlsym_send send_func; dlsym_recv recv_func; - void load_ucp_handle() { + void load_ucp_handle() + { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); if (!ucp_handle) { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); @@ -117,51 +123,56 @@ class comms_ucp_handler { dlerror(); } - void assert_dlerror() { - char *error = dlerror(); + void assert_dlerror() + { + char* error = dlerror(); ASSERT(error == NULL, "Error loading function symbol: %s\n", error); } - void load_send_func() { + void load_send_func() + { send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); assert_dlerror(); } - void load_free_req_func() { + void load_free_req_func() + { req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); assert_dlerror(); } - void load_print_info_func() { + void load_print_info_func() + { print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); assert_dlerror(); } - void load_worker_progress_func() { - worker_progress_func = - (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); + void load_worker_progress_func() + { + worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); assert_dlerror(); } - void load_recv_func() { + void load_recv_func() + { recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); assert_dlerror(); } - ucp_tag_t build_message_tag(int rank, int tag) const { + ucp_tag_t build_message_tag(int rank, int tag) const + { // keeping the rank in the lower bits enables debugging. return ((uint32_t)tag << 31) | (uint32_t)rank; } public: - int ucp_progress(ucp_worker_h worker) const { - return (*(worker_progress_func))(worker); - } + int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); } /** * @brief Frees any memory underlying the given ucp request object */ - void free_ucp_request(ucp_request *request) const { + void free_ucp_request(ucp_request* request) const + { if (request->needs_release) { request->req->completed = 0; (*(req_free_func))(request->req); @@ -172,56 +183,67 @@ class comms_ucp_handler { /** * @brief Asynchronously send data to the given endpoint using the given tag */ - void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, - size_t size, int tag, ucp_tag_t tag_mask, int rank) const { + void ucp_isend(ucp_request* req, + ucp_ep_h ep_ptr, + const void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int rank) const + { ucp_tag_t ucp_tag = build_message_tag(rank, tag); - ucs_status_ptr_t send_result = (*(send_func))( - ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); - struct ucx_context *ucp_req = (struct ucx_context *)send_result; + ucs_status_ptr_t send_result = + (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + struct ucx_context* ucp_req = (struct ucx_context*)send_result; if (UCS_PTR_IS_ERR(send_result)) { ASSERT(!UCS_PTR_IS_ERR(send_result), "unable to send UCX data message (%d)\n", UCS_PTR_STATUS(send_result)); /** - * If the request didn't fail, but it's not OK, it is in flight. - * Expect the handler to be invoked - */ + * If the request didn't fail, but it's not OK, it is in flight. + * Expect the handler to be invoked + */ } else if (UCS_PTR_STATUS(send_result) != UCS_OK) { /** - * If the request is OK, it's already been completed and we don't need to wait on it. - * The request will be a nullptr, however, so we need to create a new request - * and set it to completed to make the "waitall()" function work properly. - */ + * If the request is OK, it's already been completed and we don't need to wait on it. + * The request will be a nullptr, however, so we need to create a new request + * and set it to completed to make the "waitall()" function work properly. + */ req->needs_release = true; } else { req->needs_release = false; } - req->other_rank = rank; + req->other_rank = rank; req->is_send_request = true; - req->req = ucp_req; + req->req = ucp_req; } /** * @brief Asynchronously receive data from given endpoint with the given tag. */ - void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, - void *buf, size_t size, int tag, ucp_tag_t tag_mask, - int sender_rank) const { + void ucp_irecv(ucp_request* req, + ucp_worker_h worker, + ucp_ep_h ep_ptr, + void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int sender_rank) const + { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, - tag_mask, recv_callback); + (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); - struct ucx_context *ucp_req = (struct ucx_context *)recv_result; + struct ucx_context* ucp_req = (struct ucx_context*)recv_result; - req->req = ucp_req; - req->needs_release = true; + req->req = ucp_req; + req->needs_release = true; req->is_send_request = false; - req->other_rank = sender_rank; + req->other_rank = sender_rank; ASSERT(!UCS_PTR_IS_ERR(recv_result), "unable to receive UCX data message (%d)\n", diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp index f3216abc37..1b0548fc00 100644 --- a/cpp/include/raft/comms/util.hpp +++ b/cpp/include/raft/comms/util.hpp @@ -26,88 +26,70 @@ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, ncclGetErrorString(status)); \ - throw raft::logic_error(msg); \ - } \ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + ncclGetErrorString(status)); \ + throw raft::logic_error(msg); \ + } \ } while (0); -#define NCCL_TRY_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (ncclSuccess != status) { \ - printf("NCCL call='%s' failed. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } \ +#define NCCL_TRY_NO_THROW(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \ + } \ } while (0) namespace raft { namespace comms { -constexpr size_t get_datatype_size(const datatype_t datatype) { +constexpr size_t get_datatype_size(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return sizeof(char); - case datatype_t::UINT8: - return sizeof(uint8_t); - case datatype_t::INT32: - return sizeof(int); - case datatype_t::UINT32: - return sizeof(unsigned int); - case datatype_t::INT64: - return sizeof(int64_t); - case datatype_t::UINT64: - return sizeof(uint64_t); - case datatype_t::FLOAT32: - return sizeof(float); - case datatype_t::FLOAT64: - return sizeof(double); - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return sizeof(char); + case datatype_t::UINT8: return sizeof(uint8_t); + case datatype_t::INT32: return sizeof(int); + case datatype_t::UINT32: return sizeof(unsigned int); + case datatype_t::INT64: return sizeof(int64_t); + case datatype_t::UINT64: return sizeof(uint64_t); + case datatype_t::FLOAT32: return sizeof(float); + case datatype_t::FLOAT64: return sizeof(double); + default: throw "Unsupported datatype"; } } -constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { +constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return ncclChar; - case datatype_t::UINT8: - return ncclUint8; - case datatype_t::INT32: - return ncclInt; - case datatype_t::UINT32: - return ncclUint32; - case datatype_t::INT64: - return ncclInt64; - case datatype_t::UINT64: - return ncclUint64; - case datatype_t::FLOAT32: - return ncclFloat; - case datatype_t::FLOAT64: - return ncclDouble; - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return ncclChar; + case datatype_t::UINT8: return ncclUint8; + case datatype_t::INT32: return ncclInt; + case datatype_t::UINT32: return ncclUint32; + case datatype_t::INT64: return ncclInt64; + case datatype_t::UINT64: return ncclUint64; + case datatype_t::FLOAT32: return ncclFloat; + case datatype_t::FLOAT64: return ncclDouble; + default: throw "Unsupported datatype"; } } -constexpr ncclRedOp_t get_nccl_op(const op_t op) { +constexpr ncclRedOp_t get_nccl_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return ncclSum; - case op_t::PROD: - return ncclProd; - case op_t::MIN: - return ncclMin; - case op_t::MAX: - return ncclMax; - default: - throw "Unsupported datatype"; + case op_t::SUM: return ncclSum; + case op_t::PROD: return ncclProd; + case op_t::MIN: return ncclMin; + case op_t::MAX: return ncclMax; + default: throw "Unsupported datatype"; } } }; // namespace comms diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index 14274043f5..8a66eff242 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -36,16 +36,17 @@ namespace raft { /** helper macro for device inlined functions */ -#define DI inline __device__ +#define DI inline __device__ #define HDI inline __host__ __device__ -#define HD __host__ __device__ +#define HD __host__ __device__ /** * @brief Provide a ceiling division operation ie. ceil(a / b) * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType ceildiv(IntType a, IntType b) { +constexpr HDI IntType ceildiv(IntType a, IntType b) +{ return (a + b - 1) / b; } @@ -54,7 +55,8 @@ constexpr HDI IntType ceildiv(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignTo(IntType a, IntType b) { +constexpr HDI IntType alignTo(IntType a, IntType b) +{ return ceildiv(a, b) * b; } @@ -63,7 +65,8 @@ constexpr HDI IntType alignTo(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignDown(IntType a, IntType b) { +constexpr HDI IntType alignDown(IntType a, IntType b) +{ return (a / b) * b; } @@ -72,7 +75,8 @@ constexpr HDI IntType alignDown(IntType a, IntType b) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI bool isPo2(IntType num) { +constexpr HDI bool isPo2(IntType num) +{ return (num && !(num & (num - 1))); } @@ -81,14 +85,16 @@ constexpr HDI bool isPo2(IntType num) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) { +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) +{ return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret); } /** Device function to apply the input lambda across threads in the grid */ template -DI void forEach(int num, L lambda) { - int idx = (blockDim.x * blockIdx.x) + threadIdx.x; +DI void forEach(int num, L lambda) +{ + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; const int numThreads = blockDim.x * gridDim.x; #pragma unroll for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { @@ -100,7 +106,8 @@ DI void forEach(int num, L lambda) { static const int WarpSize = 32; /** get the laneId of the current thread */ -DI int laneId() { +DI int laneId() +{ int id; asm("mov.s32 %0, %laneid;" : "=r"(id)); return id; @@ -113,15 +120,17 @@ DI int laneId() { * @param b second input */ template -HDI void swapVals(T &a, T &b) { +HDI void swapVals(T& a, T& b) +{ T tmp = a; - a = b; - b = tmp; + a = b; + b = tmp; } /** Device function to have atomic add support for older archs */ template -DI void myAtomicAdd(Type *address, Type val) { +DI void myAtomicAdd(Type* address, Type val) +{ atomicAdd(address, val); } @@ -129,105 +138,114 @@ DI void myAtomicAdd(Type *address, Type val) { // Ref: // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf template <> -DI void myAtomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); + old = + atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); } #endif template -DI void myAtomicReduce(T *address, T val, ReduceLambda op); +DI void myAtomicReduce(T* address, T val, ReduceLambda op); template -DI void myAtomicReduce(double *address, double val, ReduceLambda op) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicReduce(double* address, double val, ReduceLambda op) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = - atomicCAS(address_as_ull, assumed, - __double_as_longlong(op(val, __longlong_as_double(assumed)))); + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(float *address, float val, ReduceLambda op) { - unsigned int *address_as_uint = (unsigned int *)address; - unsigned int old = *address_as_uint, assumed; +DI void myAtomicReduce(float* address, float val, ReduceLambda op) +{ + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; do { assumed = old; - old = atomicCAS(address_as_uint, assumed, - __float_as_uint(op(val, __uint_as_float(assumed)))); + old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(int *address, int val, ReduceLambda op) { +DI void myAtomicReduce(int* address, int val, ReduceLambda op) +{ int old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) { +DI void myAtomicReduce(long long* address, long long val, ReduceLambda op) +{ long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(unsigned long long *address, unsigned long long val, - ReduceLambda op) { +DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op) +{ unsigned long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val); +DI T myAtomicMin(T* address, T val); /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val); +DI T myAtomicMax(T* address, T val); -DI float myAtomicMin(float *address, float val) { +DI float myAtomicMin(float* address, float val) +{ myAtomicReduce(address, val, fminf); return *address; } -DI float myAtomicMax(float *address, float val) { +DI float myAtomicMax(float* address, float val) +{ myAtomicReduce(address, val, fmaxf); return *address; } -DI double myAtomicMin(double *address, double val) { +DI double myAtomicMin(double* address, double val) +{ myAtomicReduce(address, val, fmin); return *address; } -DI double myAtomicMax(double *address, double val) { +DI double myAtomicMax(double* address, double val) +{ myAtomicReduce(address, val, fmax); return *address; } @@ -239,11 +257,13 @@ DI double myAtomicMax(double *address, double val) { template HDI T myMax(T x, T y); template <> -HDI float myMax(float x, float y) { +HDI float myMax(float x, float y) +{ return fmaxf(x, y); } template <> -HDI double myMax(double x, double y) { +HDI double myMax(double x, double y) +{ return fmax(x, y); } /** @} */ @@ -255,11 +275,13 @@ HDI double myMax(double x, double y) { template HDI T myMin(T x, T y); template <> -HDI float myMin(float x, float y) { +HDI float myMin(float x, float y) +{ return fminf(x, y); } template <> -HDI double myMin(double x, double y) { +HDI double myMin(double x, double y) +{ return fmin(x, y); } /** @} */ @@ -267,11 +289,13 @@ HDI double myMin(double x, double y) { /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val) { +DI T myAtomicMin(T* address, T val) +{ myAtomicReduce(address, val, myMin); return *address; } @@ -279,11 +303,13 @@ DI T myAtomicMin(T *address, T val) { /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val) { +DI T myAtomicMax(T* address, T val) +{ myAtomicReduce(address, val, myMax); return *address; } @@ -292,7 +318,8 @@ DI T myAtomicMax(T *address, T val) { * Sign function */ template -HDI int sgn(const T val) { +HDI int sgn(const T val) +{ return (T(0) < val) - (val < T(0)); } @@ -303,11 +330,13 @@ HDI int sgn(const T val) { template HDI T myExp(T x); template <> -HDI float myExp(float x) { +HDI float myExp(float x) +{ return expf(x); } template <> -HDI double myExp(double x) { +HDI double myExp(double x) +{ return exp(x); } /** @} */ @@ -319,11 +348,13 @@ HDI double myExp(double x) { template inline __device__ T myInf(); template <> -inline __device__ float myInf() { +inline __device__ float myInf() +{ return CUDART_INF_F; } template <> -inline __device__ double myInf() { +inline __device__ double myInf() +{ return CUDART_INF; } /** @} */ @@ -335,11 +366,13 @@ inline __device__ double myInf() { template HDI T myLog(T x); template <> -HDI float myLog(float x) { +HDI float myLog(float x) +{ return logf(x); } template <> -HDI double myLog(double x) { +HDI double myLog(double x) +{ return log(x); } /** @} */ @@ -351,11 +384,13 @@ HDI double myLog(double x) { template HDI T mySqrt(T x); template <> -HDI float mySqrt(float x) { +HDI float mySqrt(float x) +{ return sqrtf(x); } template <> -HDI double mySqrt(double x) { +HDI double mySqrt(double x) +{ return sqrt(x); } /** @} */ @@ -365,13 +400,15 @@ HDI double mySqrt(double x) { * @{ */ template -DI void mySinCos(T x, T &s, T &c); +DI void mySinCos(T x, T& s, T& c); template <> -DI void mySinCos(float x, float &s, float &c) { +DI void mySinCos(float x, float& s, float& c) +{ sincosf(x, &s, &c); } template <> -DI void mySinCos(double x, double &s, double &c) { +DI void mySinCos(double x, double& s, double& c) +{ sincos(x, &s, &c); } /** @} */ @@ -383,11 +420,13 @@ DI void mySinCos(double x, double &s, double &c) { template DI T mySin(T x); template <> -DI float mySin(float x) { +DI float mySin(float x) +{ return sinf(x); } template <> -DI double mySin(double x) { +DI double mySin(double x) +{ return sin(x); } /** @} */ @@ -397,15 +436,18 @@ DI double mySin(double x) { * @{ */ template -DI T myAbs(T x) { +DI T myAbs(T x) +{ return x < 0 ? -x : x; } template <> -DI float myAbs(float x) { +DI float myAbs(float x) +{ return fabsf(x); } template <> -DI double myAbs(double x) { +DI double myAbs(double x) +{ return fabs(x); } /** @} */ @@ -417,11 +459,13 @@ DI double myAbs(double x) { template HDI T myPow(T x, T power); template <> -HDI float myPow(float x, float power) { +HDI float myPow(float x, float power) +{ return powf(x, power); } template <> -HDI double myPow(double x, double power) { +HDI double myPow(double x, double power) +{ return pow(x, power); } /** @} */ @@ -433,11 +477,13 @@ HDI double myPow(double x, double power) { template HDI T myTanh(T x); template <> -HDI float myTanh(float x) { +HDI float myTanh(float x) +{ return tanhf(x); } template <> -HDI double myTanh(double x) { +HDI double myTanh(double x) +{ return tanh(x); } /** @} */ @@ -449,11 +495,13 @@ HDI double myTanh(double x) { template HDI T myATanh(T x); template <> -HDI float myATanh(float x) { +HDI float myATanh(float x) +{ return atanhf(x); } template <> -HDI double myATanh(double x) { +HDI double myATanh(double x) +{ return atanh(x); } /** @} */ @@ -492,15 +540,18 @@ struct Sum { * @{ */ template -DI T signPrim(T x) { +DI T signPrim(T x) +{ return x < 0 ? -1 : +1; } template <> -DI float signPrim(float x) { +DI float signPrim(float x) +{ return signbit(x) == true ? -1.0f : +1.0f; } template <> -DI double signPrim(double x) { +DI double signPrim(double x) +{ return signbit(x) == true ? -1.0 : +1.0; } /** @} */ @@ -514,28 +565,33 @@ DI double signPrim(double x) { * @{ */ template -DI T maxPrim(T x, T y) { +DI T maxPrim(T x, T y) +{ return x > y ? x : y; } template <> -DI float maxPrim(float x, float y) { +DI float maxPrim(float x, float y) +{ return fmaxf(x, y); } template <> -DI double maxPrim(double x, double y) { +DI double maxPrim(double x, double y) +{ return fmax(x, y); } /** @} */ /** apply a warp-wide fence (useful from Volta+ archs) */ -DI void warpFence() { +DI void warpFence() +{ #if __CUDA_ARCH__ >= 700 __syncwarp(); #endif } /** warp-wide any boolean aggregator */ -DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __any_sync(mask, inFlag); #else @@ -545,7 +601,8 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { } /** warp-wide all boolean aggregator */ -DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __all_sync(mask, inFlag); #else @@ -564,8 +621,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { * @return the shuffled data */ template -DI T shfl(T val, int srcLane, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_sync(mask, val, srcLane, width); #else @@ -583,8 +640,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize, * @return the shuffled data */ template -DI T shfl_xor(T val, int laneMask, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_xor_sync(mask, val, laneMask, width); #else @@ -602,7 +659,8 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, * @todo Expand this to support arbitrary reduction ops */ template -DI T warpReduce(T val) { +DI T warpReduce(T val) +{ #pragma unroll for (int i = WarpSize / 2; i > 0; i >>= 1) { T tmp = shfl(val, laneId() + i); @@ -623,12 +681,13 @@ DI T warpReduce(T val) { * @todo Expand this to support arbitrary reduction ops */ template -DI T blockReduce(T val, char *smem) { - auto *sTemp = reinterpret_cast(smem); - int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; - int lid = laneId(); - int wid = threadIdx.x / WarpSize; - val = warpReduce(val); +DI T blockReduce(T val, char* smem) +{ + auto* sTemp = reinterpret_cast(smem); + int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; + int lid = laneId(); + int wid = threadIdx.x / WarpSize; + val = warpReduce(val); if (lid == 0) sTemp[wid] = val; __syncthreads(); val = lid < nWarps ? sTemp[lid] : T(0); @@ -644,8 +703,10 @@ DI T blockReduce(T val, char *smem) { * @param idx the index for which to query the stream */ inline cudaStream_t select_stream(cudaStream_t user_stream, - cudaStream_t *int_streams, int n_int_streams, - int idx) { + cudaStream_t* int_streams, + int n_int_streams, + int idx) +{ return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; } diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 486103dedb..cf06416a96 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -54,17 +54,20 @@ struct cuda_error : public raft::exception { * */ #ifndef CUDA_TRY -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \ - cudaGetErrorName(status), cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ +#define CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ } while (0) #endif /** @@ -97,13 +100,16 @@ struct cuda_error : public raft::exception { // * exception. // */ #ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \ - __FILE__, __LINE__, cudaGetErrorString(status)); \ - } \ +#define CUDA_CHECK_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ } while (0) #endif @@ -112,7 +118,7 @@ struct cuda_error : public raft::exception { * TODO: Rename original implementations in 22.04 to fix * https://github.com/rapidsai/raft/issues/128 */ -#define RAFT_CUDA_CHECK(call) CUDA_CHECK(call) +#define RAFT_CUDA_CHECK(call) CUDA_CHECK(call) #define RAFT_CUDA_CHECK_NO_THROW(call) CUDA_CHECK_NO_THROW(call) namespace raft { @@ -120,9 +126,7 @@ namespace raft { /** Helper method to get to know warp size in device code */ __host__ __device__ constexpr inline int warp_size() { return 32; } -__host__ __device__ constexpr inline unsigned int warp_full_mask() { - return 0xffffffff; -} +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping @@ -134,20 +138,23 @@ class grid_1d_thread_t { int const num_blocks{0}; /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, - size_t max_num_blocks_1d, size_t elements_per_thread = 1) + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + - (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) { + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -165,18 +172,19 @@ class grid_1d_warp_t { int const num_blocks{0}; /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min( - (overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) { + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -193,15 +201,17 @@ class grid_1d_block_t { int const num_blocks{0}; /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block, + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) { + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -217,10 +227,9 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, - rmm::cuda_stream_view stream) { - CUDA_CHECK( - cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -231,23 +240,22 @@ void copy(Type* dst, const Type* src, size_t len, */ /** performs a host to device copy */ template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - rmm::cuda_stream_view stream) { +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) +{ copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - rmm::cuda_stream_view stream) { +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) +{ copy(h_ptr, d_ptr, len, stream); } template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream)); +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -256,8 +264,11 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, * @{ */ template -void print_host_vector(const char* variable_name, const T* host_mem, - size_t componentsCount, OutStream& out) { +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ out << variable_name << "=["; for (size_t i = 0; i < componentsCount; ++i) { if (i != 0) out << ","; @@ -267,11 +278,13 @@ void print_host_vector(const char* variable_name, const T* host_mem, } template -void print_device_vector(const char* variable_name, const T* devMem, - size_t componentsCount, OutStream& out) { +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem, componentsCount, out); delete[] host_mem; } @@ -281,10 +294,10 @@ static std::mutex mutex_; static std::unordered_map allocations; template -void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, - bool setZero = false) { +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) +{ size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); std::lock_guard _(mutex_); @@ -292,17 +305,19 @@ void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, } template -void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) +{ std::lock_guard _(mutex_); size_t size = allocations[ptr]; allocations.erase(ptr); rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); } -inline void deallocate_all(rmm::cuda_stream_view stream) { +inline void deallocate_all(rmm::cuda_stream_view stream) +{ std::lock_guard _(mutex_); for (auto& alloc : allocations) { - void* ptr = alloc.first; + void* ptr = alloc.first; size_t size = alloc.second; rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); } @@ -310,29 +325,29 @@ inline void deallocate_all(rmm::cuda_stream_view stream) { } /** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() { +inline int getSharedMemPerBlock() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int smemPerBlk; - CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, - cudaDevAttrMaxSharedMemoryPerBlock, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); return smemPerBlk; } /** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() { +inline int getMultiProcessorCount() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int mpCount; - CUDA_CHECK( - cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); return mpCount; } /** helper method to convert an array on device to a string on host */ template -std::string arr2Str(const T* arr, int size, std::string name, - cudaStream_t stream, int width = 4) { +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ std::stringstream ss; T* arr_h = (T*)malloc(size * sizeof(T)); @@ -354,53 +369,54 @@ std::string arr2Str(const T* arr, int size, std::string name, /** this seems to be unused, but may be useful in the future */ template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) { +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ cudaPointerAttributes s_att; cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name - << ". device=" << s_att.device << ", err=" << s_err << std::endl; + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; } -inline uint32_t curTimeMillis() { - auto now = std::chrono::high_resolution_clock::now(); +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration) - .count(); + return std::chrono::duration_cast(duration).count(); } /** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { - return rows * columns; -} + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ + * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ template -bool is_aligned(Type* ptr, size_t alignment) { +bool is_aligned(Type* ptr, size_t alignment) +{ return reinterpret_cast(ptr) % alignment == 0; } /** calculate greatest common divisor of two numbers -* @a integer -* @b integer -* @ return gcd of a and b -*/ + * @a integer + * @b integer + * @ return gcd of a and b + */ template -IntType gcd(IntType a, IntType b) { +IntType gcd(IntType a, IntType b) +{ while (b != 0) { IntType tmp = b; - b = a % b; - a = tmp; + b = a % b; + a = tmp; } return a; } diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index a4ebcc9900..e3b324d030 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -39,9 +39,9 @@ namespace detail { /* @brief binary `sum` operator */ struct DeviceSum { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs + rhs; } }; @@ -49,7 +49,8 @@ struct DeviceSum { /* @brief binary `min` operator */ struct DeviceMin { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs < rhs ? lhs : rhs; } }; @@ -57,43 +58,44 @@ struct DeviceMin { /* @brief binary `max` operator */ struct DeviceMax { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs > rhs ? lhs : rhs; } }; /* @brief binary `product` operator */ struct DeviceProduct { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs * rhs; } }; /* @brief binary `and` operator */ struct DeviceAnd { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs & rhs); } }; /* @brief binary `or` operator */ struct DeviceOr { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs | rhs); } }; /* @brief binary `xor` operator */ struct DeviceXor { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs ^ rhs); } }; @@ -103,9 +105,9 @@ struct DeviceXor { #define errmsg_cast "size mismatch." template -__forceinline__ __device__ T_output type_reinterpret(T_input value) { - static_assert(sizeof(T_output) == sizeof(T_input), - "type_reinterpret for different size"); +__forceinline__ __device__ T_output type_reinterpret(T_input value) +{ + static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size"); return *(reinterpret_cast(&value)); } @@ -118,25 +120,22 @@ struct genericAtomicOperationImpl; // single byte atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = T((old >> shift) & 0xff); - uint8_t updating_value = - type_reinterpret(op(target_value, update_value)); - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = T((old >> shift) & 0xff); + uint8_t updating_value = type_reinterpret(op(target_value, update_value)); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return T((old >> shift) & 0xff); @@ -146,26 +145,24 @@ struct genericAtomicOperationImpl { // 2 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { - using T_int = unsigned int; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { + using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = - type_reinterpret(op(target_value, update_value)); - - T_int new_value = (is_32_align) - ? (old & 0xffff0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); + uint16_t updating_value = type_reinterpret(op(target_value, update_value)); + + T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value + : (old & 0xffff) | (T_int(updating_value) << 16); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return (is_32_align) ? T(old & 0xffff) : T(old >> 16); @@ -176,20 +173,18 @@ struct genericAtomicOperationImpl { // 4 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; T old_value = *addr; T assumed{old_value}; if constexpr (std::is_same{} && (std::is_same{})) { - if (isnan(update_value)) { - return old_value; - } + if (isnan(update_value)) { return old_value; } } do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -206,17 +201,13 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = float; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMax op) { - if (isnan(update_value)) { - return *addr; - } + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + if (isnan(update_value)) { return *addr; } - T old = - (update_value >= 0) - ? __int_as_float(atomicMax((int*)addr, __float_as_int(update_value))) - : __uint_as_float( - atomicMin((unsigned int*)addr, __float_as_uint(update_value))); + T old = (update_value >= 0) + ? __int_as_float(atomicMax((int*)addr, __float_as_int(update_value))) + : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(update_value))); return old; } @@ -225,8 +216,8 @@ struct genericAtomicOperationImpl { // 8 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -234,7 +225,7 @@ struct genericAtomicOperationImpl { T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -250,8 +241,8 @@ struct genericAtomicOperationImpl { // ------------------------------------------------------------------------------------------------- // specialized functions for operators -// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.) -// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int +// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is +// not supproted.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int // CUDA natively supports `unsigned long long int` for `atomicAdd`, @@ -264,12 +255,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -277,12 +267,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -297,12 +286,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -310,12 +298,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMin op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMin(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -323,48 +310,44 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMax op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMax(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceAnd op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAnd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceOr op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicOr(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceXor op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicXor(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -377,13 +360,12 @@ struct typesAtomicCASImpl; template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); // the 'target_value' in `old` can be different from `compare` // because other thread may update the value @@ -394,15 +376,14 @@ struct typesAtomicCASImpl { uint8_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = T((old >> shift) & 0xff); // have to compare `target_value` and `compare` before calling atomicCAS // the `target_value` in `old` can be different with `compare` if (target_value != compare) break; - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return target_value; @@ -411,13 +392,13 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; @@ -425,12 +406,12 @@ struct typesAtomicCASImpl { uint16_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); if (target_value != compare) break; - T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val - : (old & 0xffff) | (T_int(u_val) << 16); + T_int new_value = + (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); @@ -440,8 +421,8 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; T_int ret = atomicCAS(reinterpret_cast(addr), @@ -454,8 +435,8 @@ struct typesAtomicCASImpl { // 8 bytes atomic operation template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -487,11 +468,10 @@ struct typesAtomicCASImpl { * @returns The old value at `address` * -------------------------------------------------------------------------**/ template -typename std::enable_if_t::value, T> __forceinline__ - __device__ - genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; +typename std::enable_if_t::value, T> __forceinline__ __device__ +genericAtomicOperation(T* address, T const& update_value, BinaryOp op) +{ + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -499,11 +479,11 @@ typename std::enable_if_t::value, T> __forceinline__ template __forceinline__ __device__ bool genericAtomicOperation(bool* address, bool const& update_value, - BinaryOp op) { + BinaryOp op) +{ using T = bool; // don't use underlying type to apply operation for bool - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -525,9 +505,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceSum{}); +__forceinline__ __device__ T atomicAdd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{}); } /** @@ -546,9 +526,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMin{}); +__forceinline__ __device__ T atomicMin(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{}); } /** @@ -567,9 +547,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMax{}); +__forceinline__ __device__ T atomicMax(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{}); } /** @@ -589,9 +569,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { - return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, - val); +__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) +{ + return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, val); } /** @@ -609,11 +589,10 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceAnd{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicAnd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{}); } /** @@ -631,11 +610,10 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) { - return raft::genericAtomicOperation(address, val, - raft::device_atomics::detail::DeviceOr{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicOr(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{}); } /** @@ -653,9 +631,8 @@ __forceinline__ __device__ T atomicOr(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceXor{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicXor(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{}); } diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh index c4c384c45f..46edf0bf47 100644 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ b/cpp/include/raft/distance/detail/canberra.cuh @@ -45,75 +45,108 @@ namespace detail { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch work */ -template -static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void canberraImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - const auto add = raft::myAbs(x) + raft::myAbs(y); + const auto add = raft::myAbs(x) + raft::myAbs(y); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto canberraRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); + auto canberraRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); canberraRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto canberraColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); + auto canberraColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); canberraColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void canberra(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { canberraImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -138,16 +171,25 @@ void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void canberraImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - canberraOutType; + typedef typename std::conditional::type canberraOutType; Index_ lda, ldb, ldd; - canberraOutType *pDcast = reinterpret_cast(pD); + canberraOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; canberra( diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh index 77fba28310..99b314bd08 100644 --- a/cpp/include/raft/distance/detail/chebyshev.cuh +++ b/cpp/include/raft/distance/detail/chebyshev.cuh @@ -44,72 +44,105 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void chebyshevImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - acc = raft::myMax(acc, diff); + acc = raft::myMax(acc, diff); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto chebyshevRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevRowMajor); + auto chebyshevRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevRowMajor); chebyshevRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto chebyshevColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevColMajor); + auto chebyshevColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevColMajor); chebyshevColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void chebyshev(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { chebyshevImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -134,16 +167,25 @@ void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void chebyshevImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - chebyshevOutType; + typedef typename std::conditional::type chebyshevOutType; Index_ lda, ldb, ldd; - chebyshevOutType *pDcast = reinterpret_cast(pD); + chebyshevOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; chebyshev( diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index cee986997a..159f9ab580 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -47,69 +47,81 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, const DataT *x2n, const DataT *y2n, - IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void correlationImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const DataT* x2n, + const DataT* y2n, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation auto epilog_lambda = [x2n, y2n, m, n, k] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh]; extern __shared__ char smem[]; - DataT *sx2Norm = - (DataT *)(&smem[KPolicy::SmemSize + - (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); - DataT *sy2Norm = (&sx2Norm[KPolicy::Mblk]); + DataT* sx2Norm = + (DataT*)(&smem[KPolicy::SmemSize + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); + DataT* sy2Norm = (&sx2Norm[KPolicy::Mblk]); // Load x & y norms required by this threadblock in shmem buffer if (gridStrideX == blockIdx.x * KPolicy::Nblk) { for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) { - auto idx = gridStrideY + i; + auto idx = gridStrideY + i; sx2Norm[i] = idx < m ? x2n[idx] : 0; } } for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) { - auto idx = gridStrideX + i; + auto idx = gridStrideX + i; sy2Norm[i] = idx < n ? y2n[idx] : 0; } __syncthreads(); #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { - regx2n[i] = - sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; + regx2n[i] = sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; } #pragma unroll for (int i = 0; i < KPolicy::AccColsPerTh; ++i) { - regy2n[i] = - sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; + regy2n[i] = sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; } #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); + auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); @@ -121,46 +133,68 @@ static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - constexpr auto correlationRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - correlationRowMajor); + constexpr auto correlationRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationRowMajor); correlationRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - constexpr auto correlationColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - correlationColMajor); + constexpr auto correlationColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationColMajor); correlationColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, const DataT *x2n, const DataT *y2n, - OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { +template +void correlation(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const DataT* x2n, + const DataT* y2n, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + correlationImpl( + x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + correlationImpl( + x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { correlationImpl( x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -185,63 +219,118 @@ void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void correlationImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, AccType *workspace, size_t &worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { +template +void correlationImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + AccType* workspace, + size_t& worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - correlationOutType; + typedef typename std::conditional::type correlationOutType; Index_ lda, ldb, ldd; - correlationOutType *pDcast = reinterpret_cast(pD); + correlationOutType* pDcast = reinterpret_cast(pD); ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) || (worksize < 2 * m * sizeof(AccType))), "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); - AccType *norm_col_vec = workspace; - AccType *norm_row_vec = workspace; - AccType *sq_norm_col_vec = workspace; - AccType *sq_norm_row_vec = workspace; + AccType* norm_col_vec = workspace; + AccType* norm_row_vec = workspace; + AccType* sq_norm_col_vec = workspace; + AccType* sq_norm_row_vec = workspace; if (pA != pB) { norm_row_vec += m; - raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, - stream, false, raft::Nop(), + raft::linalg::reduce(norm_col_vec, + pA, + k, + m, + (AccType)0, + isRowMajor, + true, + stream, + false, + raft::Nop(), raft::Sum()); - raft::linalg::reduce(norm_row_vec, pB, k, n, (AccType)0, isRowMajor, true, - stream, false, raft::Nop(), + raft::linalg::reduce(norm_row_vec, + pB, + k, + n, + (AccType)0, + isRowMajor, + true, + stream, + false, + raft::Nop(), raft::Sum()); sq_norm_col_vec += (m + n); sq_norm_row_vec = sq_norm_col_vec + m; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, - isRowMajor, stream); - raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, - isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream); } else { - raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, - stream, false, raft::Nop(), + raft::linalg::reduce(norm_col_vec, + pA, + k, + m, + (AccType)0, + isRowMajor, + true, + stream, + false, + raft::Nop(), raft::Sum()); sq_norm_col_vec += m; sq_norm_row_vec = sq_norm_col_vec; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, - isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); } if (isRowMajor) { lda = k, ldb = k, ldd = n; - correlation( - m, n, k, lda, ldb, ldd, pA, pB, norm_col_vec, norm_row_vec, - sq_norm_col_vec, sq_norm_row_vec, pDcast, fin_op, stream); + correlation(m, + n, + k, + lda, + ldb, + ldd, + pA, + pB, + norm_col_vec, + norm_row_vec, + sq_norm_col_vec, + sq_norm_row_vec, + pDcast, + fin_op, + stream); } else { lda = n, ldb = m, ldd = m; - correlation(n, m, k, lda, ldb, ldd, pB, pA, norm_row_vec, - norm_col_vec, sq_norm_row_vec, sq_norm_col_vec, pDcast, - fin_op, stream); + correlation(n, + m, + k, + lda, + ldb, + ldd, + pB, + pA, + norm_row_vec, + norm_col_vec, + sq_norm_row_vec, + sq_norm_col_vec, + pDcast, + fin_op, + stream); } } diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index 900e045edc..5684fd0a16 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -25,7 +25,7 @@ namespace detail { /** * @brief the cosine distance matrix calculation implementer - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -50,30 +50,43 @@ namespace detail { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void cosineImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -86,43 +99,66 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto cosineRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); + auto cosineRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto cosineColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); + auto cosineColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); cosineColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, const DataT *yn, - OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { +template +void cosine(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { cosineImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -131,7 +167,7 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the expanded cosine distance matrix calculation - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam IType input data-type (for A and B matrices) * @tparam AccType accumulation data-type @@ -152,12 +188,23 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, AccType *workspace, - size_t worksize, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void cosineAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + AccType* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); }; // Wrap fin_op to allow computing 1 - pA before calling fin_op @@ -166,39 +213,33 @@ void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, }; typedef std::is_same is_bool; - typedef typename std::conditional::type - CosOutType; - CosOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type CosOutType; + CosOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, - stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream); } else { lda = n, ldb = m, ldd = m; - cosine(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, - wrapped_fin_op, stream); + cosine( + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream); } } diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 199dc73fb6..91838e8bfa 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -85,211 +85,461 @@ enum DistanceType : unsigned short { }; namespace { -template struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {} + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg = 2.0f) + { + } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo1( - m, n, k, x, y, dist, false, (AccType *)workspace, worksize, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo1( + m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo1( - m, n, k, x, y, dist, true, (AccType *)workspace, worksize, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo1( + m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::cosineAlgo1(m, n, k, x, y, dist, - (AccType *)workspace, worksize, - fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::cosineAlgo1( + m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo2( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo2( m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo2( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo2( m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::l1Impl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::l1Impl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::chebyshevImpl(m, n, k, x, y, dist, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::chebyshevImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::hellingerImpl(m, n, k, x, y, dist, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::hellingerImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType metric_arg) { - raft::distance::detail::minkowskiImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::detail::minkowskiImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::canberraImpl(m, n, k, x, y, dist, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::canberraImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::hammingUnexpandedImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::hammingUnexpandedImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::jensenShannonImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::jensenShannonImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::russellRaoImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::russellRaoImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::klDivergenceImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::klDivergenceImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::correlationImpl( - m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::correlationImpl( + m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; @@ -320,53 +570,71 @@ struct DistanceImplOutType fin_op(AccType in, int g_idx);. If one needs * any other parameters, feel free to pass them via closure. */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - DistanceImpl - distImpl; - distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, - isRowMajor, metric_arg); +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + DistanceImpl distImpl; + distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } /** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - * - * @note if workspace is passed as nullptr, this will return in - * worksize, the number of bytes of workspace required - */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { - return d_val; - }; - distance(x, y, dist, m, n, k, workspace, worksize, default_fin_op, - stream, isRowMajor, metric_arg); + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; }; + distance( + x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -386,14 +654,16 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @note If the specifed distanceType doesn't need the workspace at all, it * returns 0. */ -template -size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, - Index_ k) { - size_t worksize = 0; - constexpr bool is_allocated = - (distanceType <= raft::distance::DistanceType::CosineExpanded) || - (distanceType == raft::distance::DistanceType::CorrelationExpanded); +template +size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) +{ + size_t worksize = 0; + constexpr bool is_allocated = (distanceType <= raft::distance::DistanceType::CosineExpanded) || + (distanceType == raft::distance::DistanceType::CorrelationExpanded); constexpr int numOfBuffers = (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2 : 1; @@ -425,17 +695,21 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, * @param isRowMajor whether the matrices are row-major or col-major */ template -void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - rmm::device_uvector &workspace, - cudaStream_t stream, bool isRowMajor, - Type metric_arg = 2.0f) { - auto worksize = - getWorkspaceSize(x, y, m, n, k); +void pairwise_distance_impl(const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + rmm::device_uvector& workspace, + cudaStream_t stream, + bool isRowMajor, + Type metric_arg = 2.0f) +{ + auto worksize = getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); - distance(x, y, dist, m, n, k, - workspace.data(), worksize, - stream, isRowMajor, metric_arg); + distance( + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); } /** @} */ }; // namespace detail diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 8b8882c244..1166543f8c 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -49,30 +49,44 @@ namespace detail { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, - IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanExpImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -94,47 +108,68 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto euclideanExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); + auto euclideanExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); + auto euclideanExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void euclideanExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanExpImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -162,53 +197,59 @@ void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - AccType *workspace, size_t &worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + AccType* workspace, + size_t& worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(InType in) { return in; }; typedef std::is_same is_bool; - typedef typename std::conditional::type - ExpOutType; - ExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type ExpOutType; + ExpOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; euclideanExp( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, - fin_op, stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; euclideanExp( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, - fin_op, stream); + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream); } } /** - * @brief the unexpanded euclidean distance matrix calculation + * @brief the unexpanded euclidean distance matrix calculation * It computes the following equation: cij = op((ai-bj)^2) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -228,16 +269,30 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -248,10 +303,11 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { if (sqrt) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -264,48 +320,68 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto euclideanUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpRowMajor); + auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor); euclideanUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpColMajor); + auto euclideanUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpColMajor); euclideanUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanUnExpImpl( x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -331,15 +407,25 @@ void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo2(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - UnExpOutType; - UnExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type UnExpOutType; + UnExpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index ca8f729a68..9373992ada 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -36,24 +36,24 @@ template struct KVPMinReduceImpl { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template struct MinAndDistanceReduceOpImpl { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, KVP* out, const KVP& other) { + DI void operator()(LabelT rid, KVP* out, const KVP& other) + { if (other.value < out->value) { - out->key = other.key; + out->key = other.key; out->value = other.value; } } - DI void init(KVP* out, DataT maxVal) { - out->key = -1; + DI void init(KVP* out, DataT maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -61,38 +61,35 @@ struct MinAndDistanceReduceOpImpl { template struct MinReduceOpImpl { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, DataT* out, const KVP& other) { - if (other.value < *out) { - *out = other.value; - } + DI void operator()(LabelT rid, DataT* out, const KVP& other) + { + if (other.value < *out) { *out = other.value; } } DI void init(DataT* out, DataT maxVal) { *out = maxVal; } }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { +__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < m) { - redOp.init(min + tid, maxVal); - } + if (tid < m) { redOp.init(min + tid, maxVal); } } template -void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, - cudaStream_t stream) { +void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, cudaStream_t stream) +{ auto blks = raft::ceildiv(m, 256); - initKernel - <<>>(min, m, maxVal, redOp); + initKernel<<>>(min, m, maxVal, redOp); } // TODO: specialize this function for MinAndDistanceReduceOp // with atomicCAS of 64 bit which will eliminate mutex and shfls -template -DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, - IdxT m, IdxT gridStrideY) { - const auto lid = threadIdx.x % raft::WarpSize; +template +DI void updateReducedVal( + int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY) +{ + const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; // for now have first lane from each warp update a unique output row. This @@ -117,21 +114,38 @@ DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, if (j < (raft::WarpSize / P::AccThCols) - 1) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); + auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols); - val[i] = {tmpkey, tmpvalue}; + val[i] = {tmpkey, tmpvalue}; } } } } -template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( - OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, - IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, - KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) { +__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; typedef cub::KeyValuePair KVPair; @@ -144,7 +158,9 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( // epilogue operation lambda for final value calculation auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); @@ -173,72 +189,105 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; - KVPair tmp = {tmpkey, acc[i][j]}; + KVPair tmp = {tmpkey, acc[i][j]}; if (tmpkey < n) { - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } } }; - auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, - maxVal] __device__(IdxT gridStrideY) { - KVPReduceOpT pairRed_op(pairRedOp); - ReduceOpT red_op(redOp); + auto rowEpilog_lambda = + [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) { + KVPReduceOpT pairRed_op(pairRedOp); + ReduceOpT red_op(redOp); - const auto accrowid = threadIdx.x / P::AccThCols; - const auto lid = raft::laneId(); + const auto accrowid = threadIdx.x / P::AccThCols; + const auto lid = raft::laneId(); // reduce #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); - KVPair tmp = {tmpkey, tmpvalue}; - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + for (int j = P::AccThCols / 2; j > 0; j >>= 1) { + auto tmpkey = raft::shfl(val[i].key, lid + j); + auto tmpvalue = raft::shfl(val[i].value, lid + j); + KVPair tmp = {tmpkey, tmpvalue}; + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + } } - } - updateReducedVal(mutex, min, val, red_op, - m, gridStrideY); + updateReducedVal(mutex, min, val, red_op, m, gridStrideY); // reset the val array. #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; - } - }; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } + }; IdxT lda = k, ldb = k, ldd = n; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op, - epilog_lambda, fin_op, rowEpilog_lambda); + PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + xn, + yn, + nullptr, + smem, + core_op, + epilog_lambda, + fin_op, + rowEpilog_lambda); obj.run(); } -template -void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NNImpl(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + int* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ typedef typename linalg::Policy4x4::Policy P; dim3 blk(P::Nthreads); - auto nblks = raft::ceildiv(m, P::Nthreads); + auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); typedef cub::KeyValuePair KVPair; // Accumulation operation lambda - auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { @@ -249,25 +298,34 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; - constexpr size_t shmemSize = - P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); + constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); if (sqrt) { - auto fusedL2NNSqrt = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); + auto fusedL2NNSqrt = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); fusedL2NNSqrt<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, - core_lambda, fin_op); + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } else { - auto fusedL2NN = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); - fusedL2NN<<>>(min, x, y, xn, yn, m, n, k, - maxVal, workspace, redOp, - pairRedOp, core_lambda, fin_op); + auto fusedL2NN = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); + fusedL2NN<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh index 0169ba33a2..886b9d1426 100644 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ b/cpp/include/raft/distance/detail/hamming.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the Hamming distance matrix using the unexpanded form: - * It computes the following equation: + * It computes the following equation: Cij = sum(x_i != y_i) / k * * @tparam DataT input data-type (for A and B matrices) @@ -47,30 +47,41 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m, - IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void hammingUnexpandedImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += (x != y); - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += (x != y); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [k] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [k] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { const DataT one_over_k = DataT(1.0) / k; #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -82,46 +93,65 @@ static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m, }; if (isRowMajor) { - auto hammingUnexpandedRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hammingUnexpandedRowMajor); + auto hammingUnexpandedRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedRowMajor); hammingUnexpandedRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto hammingUnexpandedColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hammingUnexpandedColMajor); + auto hammingUnexpandedColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedColMajor); hammingUnexpandedColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void hammingUnexpanded(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + hammingUnexpandedImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + hammingUnexpandedImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { hammingUnexpandedImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -130,7 +160,7 @@ void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Hamming Unexpanded distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = sum(x_i != y_i) / k * * @tparam InType input data-type (for A and B matrices) @@ -148,28 +178,35 @@ void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void hammingUnexpandedImpl(int m, int n, int k, const InType *pA, - const InType *pB, OutType *pD, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { +template +void hammingUnexpandedImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - hammingUnexpandedOutType; + typedef + typename std::conditional::type hammingUnexpandedOutType; Index_ lda, ldb, ldd; - hammingUnexpandedOutType *pDcast = - reinterpret_cast(pD); + hammingUnexpandedOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; - hammingUnexpanded(m, n, k, lda, ldb, ldd, pA, pB, pDcast, - fin_op, stream); + hammingUnexpanded( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; - hammingUnexpanded(n, m, k, lda, ldb, ldd, pB, pA, - pDcast, fin_op, stream); + hammingUnexpanded( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); } } diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 933d850dbf..189bbed491 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -24,7 +24,7 @@ namespace detail { /** * @brief the Hellinger distance matrix using the expanded form: - * It computes the following equation: + * It computes the following equation: cij = sqrt(1 - sum(sqrt(x_k * y_k))) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -52,29 +52,40 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void hellingerImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); - auto unaryOp_lambda = [] __device__(DataT input) { - return raft::mySqrt(input); - }; + auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); }; // First sqrt x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } // Accumulation operation lambda @@ -85,71 +96,91 @@ static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); + const auto finalVal = (1 - acc[i][j]); const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::mySqrt(rectifier * finalVal); + acc[i][j] = raft::mySqrt(rectifier * finalVal); } } }; if (isRowMajor) { - auto hellingerRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerRowMajor); + auto hellingerRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerRowMajor); hellingerRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto hellingerColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerColMajor); + auto hellingerColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerColMajor); hellingerColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } // Revert sqrt of x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } CUDA_CHECK(cudaGetLastError()); } -template -void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void hellinger(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { hellingerImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -158,7 +189,7 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Hellinger distance matrix calculation - * It computes the following equation: + * It computes the following equation: sqrt(1 - sum(sqrt(x_k * y_k)) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -180,16 +211,25 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void hellingerImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - hellingerOutType; + typedef typename std::conditional::type hellingerOutType; Index_ lda, ldb, ldd; - hellingerOutType *pDcast = reinterpret_cast(pD); + hellingerOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; hellinger( diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh index 1e39f39682..b3240fe398 100644 --- a/cpp/include/raft/distance/detail/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the Jensen Shannon distance matrix: - * It computes the following equation: + * It computes the following equation: Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) * @@ -48,37 +48,49 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void jensenShannonImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const DataT m = 0.5f * (x + y); + const DataT m = 0.5f * (x + y); const bool m_zero = (m == 0); - const auto logM = (!m_zero) * raft::myLog(m + m_zero); + const auto logM = (!m_zero) * raft::myLog(m + m_zero); const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += (-x * (logM - raft::myLog(x + x_zero))) + - (-y * (logM - raft::myLog(y + y_zero))); + acc += (-x * (logM - raft::myLog(x + x_zero))) + (-y * (logM - raft::myLog(y + y_zero))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -89,46 +101,65 @@ static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; if (isRowMajor) { - auto jensenShannonRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - jensenShannonRowMajor); + auto jensenShannonRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, jensenShannonRowMajor); jensenShannonRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto jensenShannonColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - jensenShannonColMajor); + auto jensenShannonColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, jensenShannonColMajor); jensenShannonColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void jensenShannon(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + jensenShannonImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + jensenShannonImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { jensenShannonImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -137,7 +168,7 @@ void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Jensen Shannon distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) * @@ -156,26 +187,34 @@ void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void jensenShannonImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void jensenShannonImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - jensenShannonOutType; + typedef typename std::conditional::type jensenShannonOutType; Index_ lda, ldb, ldd; - jensenShannonOutType *pDcast = reinterpret_cast(pD); + jensenShannonOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; - jensenShannon(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + jensenShannon( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; - jensenShannon(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, - stream); + jensenShannon( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); } } } // namespace detail diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh index 5a18ba1670..31127a4d8d 100644 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/kl_divergence.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the KL Divergence distance matrix: - * It computes the following equation: + * It computes the following equation: Cij = 0.5 * sum(x * log (x / y)); * This distance computation modifies A or B by computing a log(x) * and then performing a `pow(e, log(x))` to convert it back. Because of this, @@ -51,17 +51,29 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void klDivergenceImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -80,13 +92,11 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, if (isRowMajor) { const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += - x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); + acc += x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); } else { const bool y_zero = (y == 0); const bool x_zero = (x == 0); - acc += - y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); + acc += y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); } }; @@ -102,10 +112,11 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -116,79 +127,158 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; if (isRowMajor) { - constexpr auto klDivergenceRowMajor = - pairwiseDistanceMatKernel; + constexpr auto klDivergenceRowMajor = pairwiseDistanceMatKernel; constexpr auto klDivergenceRowMajorXequalY = - pairwiseDistanceMatKernel; + decltype(epilog_lambda), + FinalLambda, + true>; if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceRowMajor); - klDivergenceRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + (DataT*)y, y, n * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajor); + klDivergenceRowMajor<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda, + epilog_lambda, + fin_op); // Now reverse previous log (x) back to x using (e ^ log(x)) raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda_reverse, stream); + (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); } else { - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceRowMajorXequalY); - klDivergenceRowMajorXequalY<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, - core_lambda_x_equal_y, epilog_lambda, fin_op); + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajorXequalY); + klDivergenceRowMajorXequalY<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda_x_equal_y, + epilog_lambda, + fin_op); } } else { - constexpr auto klDivergenceColMajor = - pairwiseDistanceMatKernel; + constexpr auto klDivergenceColMajor = pairwiseDistanceMatKernel; constexpr auto klDivergenceColMajorXequalY = - pairwiseDistanceMatKernel; + decltype(epilog_lambda), + FinalLambda, + false>; if (x != y) { raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceColMajor); - klDivergenceColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + (DataT*)x, x, m * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceColMajor); + klDivergenceColMajor<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda, + epilog_lambda, + fin_op); // Now reverse previous log (x) back to x using (e ^ log(x)) raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda_reverse, stream); + (DataT*)x, x, m * k, unaryOp_lambda_reverse, stream); } else { - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceColMajorXequalY); - klDivergenceColMajorXequalY<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, - core_lambda_x_equal_y, epilog_lambda, fin_op); + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceColMajorXequalY); + klDivergenceColMajorXequalY<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda_x_equal_y, + epilog_lambda, + fin_op); } } CUDA_CHECK(cudaGetLastError()); } -template -void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void klDivergence(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + klDivergenceImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + klDivergenceImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { klDivergenceImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -197,7 +287,7 @@ void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the KL Divergence distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = 0.5 * sum(x * log (x / y)); * This distance computation modifies A or B by computing a log(x) * and then performing a `pow(e, log(x))` to convert it back. Because of this, @@ -218,25 +308,34 @@ void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void klDivergenceImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void klDivergenceImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - klDivergenceOutType; + typedef typename std::conditional::type klDivergenceOutType; Index_ lda, ldb, ldd; - klDivergenceOutType *pDcast = reinterpret_cast(pD); + klDivergenceOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; - klDivergence(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + klDivergence( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; - klDivergence(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + klDivergence( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); } } } // namespace detail diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 33e9bae206..e444e65d1f 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -43,16 +43,29 @@ namespace detail { * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void l1Impl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -63,47 +76,69 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto l1RowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); + auto l1RowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); l1RowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto l1ColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); + auto l1ColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); l1ColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, - const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void l1(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - l1Impl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + l1Impl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { l1Impl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -131,16 +166,25 @@ void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void l1Impl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void l1Impl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef - typename std::conditional::type L1OutType; + typedef typename std::conditional::type L1OutType; Index_ lda, ldb, ldd; - L1OutType *pDcast = reinterpret_cast(pD); + L1OutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; l1( diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh index 8bd3deb08f..22a183c22c 100644 --- a/cpp/include/raft/distance/detail/minkowski.cuh +++ b/cpp/include/raft/distance/detail/minkowski.cuh @@ -22,7 +22,7 @@ namespace distance { namespace detail { /** - * @brief the unexpanded Minkowski distance matrix calculation + * @brief the unexpanded Minkowski distance matrix calculation * It computes the following equation: cij = sum(|x - y|^p)^(1/p) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -45,16 +45,30 @@ namespace detail { * @param[in] stream cuda stream to launch work * @param[in] the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT p) { +template +void minkowskiUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT p) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -65,10 +79,11 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [p] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { const auto one_over_p = 1.0f / p; #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -80,48 +95,68 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto minkowskiUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpRowMajor); + auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor); minkowskiUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto minkowskiUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpColMajor); + auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor); minkowskiUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) { +template +void minkowskiUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT metric_arg) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else { minkowskiUnExpImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); @@ -147,15 +182,25 @@ void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] isRowMajor whether the input and output matrices are row major * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +void minkowskiImpl(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - LpUnexpOutType; - LpUnexpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type LpUnexpOutType; + LpUnexpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index a98bda1541..8fa7801c70 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -34,11 +34,11 @@ namespace detail { * @tparam OutT output data-type (for C and D matrices) * @tparam IdxT index data-type * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda tells how to accumulate an x and y into + * @tparam CoreLambda tells how to accumulate an x and y into acc. its signature: template void core_lambda(AccT& acc, const DataT& x, const DataT& y) - * @tparam EpilogueLambda applies an elementwise function to compute final + * @tparam EpilogueLambda applies an elementwise function to compute final values. Its signature is: template void epilogue_lambda (AccT acc[][], DataT* regxn, DataT* regyn); @@ -60,21 +60,27 @@ namespace detail { * @param fin_op the final gemm epilogue lambda */ -template > +template > struct PairwiseDistances : public BaseClass { private: typedef Policy P; - const DataT *xn; - const DataT *yn; - const DataT *const yBase; - OutT *dOutput; - char *smem; + const DataT* xn; + const DataT* yn; + const DataT* const yBase; + OutT* dOutput; + char* smem; CoreLambda core_op; EpilogueLambda epilog_op; FinalLambda fin_op; @@ -84,11 +90,21 @@ struct PairwiseDistances : public BaseClass { public: // Constructor - DI PairwiseDistances(const DataT *_x, const DataT *_y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, - const DataT *_xn, const DataT *_yn, OutT *_dOutput, - char *_smem, CoreLambda _core_op, - EpilogueLambda _epilog_op, FinalLambda _fin_op, + DI PairwiseDistances(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + const DataT* _xn, + const DataT* _yn, + OutT* _dOutput, + char* _smem, + CoreLambda _core_op, + EpilogueLambda _epilog_op, + FinalLambda _fin_op, rowEpilogueLambda _rowEpilog_op) : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), xn(_xn), @@ -99,9 +115,12 @@ struct PairwiseDistances : public BaseClass { core_op(_core_op), epilog_op(_epilog_op), fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op) {} + rowEpilog_op(_rowEpilog_op) + { + } - DI void run() { + DI void run() + { for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; gridStrideY += P::Mblk * gridDim.y) { for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; @@ -115,7 +134,8 @@ struct PairwiseDistances : public BaseClass { } private: - DI void updateIndicesY() { + DI void updateIndicesY() + { const auto stride = P::Nblk * gridDim.x; if (isRowMajor) { this->y += stride * this->ldb; @@ -125,21 +145,23 @@ struct PairwiseDistances : public BaseClass { this->yrowid += stride; } - DI void updateIndicesXY() { + DI void updateIndicesXY() + { const auto stride = P::Mblk * gridDim.y; if (isRowMajor) { this->x += stride * this->lda; this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; - this->y = yBase + this->yrowid * this->ldb; + this->y = yBase + this->yrowid * this->ldb; } else { this->x += stride; this->yrowid = IdxT(blockIdx.x) * P::Nblk; - this->y = yBase + this->yrowid + this->srowid * this->ldb; + this->y = yBase + this->yrowid + this->srowid * this->ldb; } this->xrowid += stride; } - DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) { + DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) + { // Fetch next grid stride ldg if within range if ((gridStrideX + gridDim.x * P::Nblk) < this->n) { updateIndicesY(); @@ -150,10 +172,9 @@ struct PairwiseDistances : public BaseClass { } } - DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { - if (gridStrideX == blockIdx.x * P::Nblk) { - this->ldgXY(0); - } + DI void prolog(IdxT gridStrideX, IdxT gridStrideY) + { + if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); } #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -168,7 +189,8 @@ struct PairwiseDistances : public BaseClass { this->pageWr ^= 1; } - DI void loop() { + DI void loop() + { for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { this->ldgXY(kidx); accumulate(); // on the previous k-block @@ -185,7 +207,8 @@ struct PairwiseDistances : public BaseClass { this->pageRd ^= 1; } - DI void accumulate() { + DI void accumulate() + { #pragma unroll for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { this->ldsXY(ki); @@ -202,21 +225,22 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT gridStrideX, IdxT gridStrideY) { + DI void epilog(IdxT gridStrideX, IdxT gridStrideY) + { if (useNorms) { - DataT *sxNorm = (DataT *)(&smem[P::SmemSize]); - DataT *syNorm = (&sxNorm[P::Mblk]); + DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); + DataT* syNorm = (&sxNorm[P::Mblk]); // Load x & y norms required by this threadblock in shmem buffer if (gridStrideX == blockIdx.x * P::Nblk) { for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; + auto idx = gridStrideY + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = gridStrideX + i; + auto idx = gridStrideX + i; syNorm[i] = idx < this->n ? yn[idx] : 0; } @@ -291,41 +315,68 @@ struct PairwiseDistances : public BaseClass { * @param fin_op the final gemm epilogue lambda */ -template +template __global__ __launch_bounds__(Policy::Nthreads, 2) - void pairwiseDistanceMatKernel(const DataT *x, const DataT *y, - const DataT *_xn, const DataT *_yn, IdxT m, - IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, CoreLambda core_op, - EpilogueLambda epilog_op, FinalLambda fin_op) { + void pairwiseDistanceMatKernel(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + CoreLambda core_op, + EpilogueLambda epilog_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; auto rowEpilog = [] __device__(IdxT starty) { return; }; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, - epilog_op, fin_op, rowEpilog); + PairwiseDistances + obj( + x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); obj.run(); } template -dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { - const auto numSMs = raft::getMultiProcessorCount(); +dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) +{ + const auto numSMs = raft::getMultiProcessorCount(); int numBlocksPerSm = 0; dim3 grid; - CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, func, P::Nthreads, sMemSize)); + CUDA_CHECK( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize)); std::size_t minGridSize = numSMs * numBlocksPerSm; - std::size_t yChunks = raft::ceildiv(m, P::Mblk); - std::size_t xChunks = raft::ceildiv(n, P::Nblk); - grid.y = yChunks > minGridSize ? minGridSize : yChunks; - grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; + std::size_t yChunks = raft::ceildiv(m, P::Mblk); + std::size_t xChunks = raft::ceildiv(n, P::Nblk); + grid.y = yChunks > minGridSize ? minGridSize : yChunks; + grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { std::size_t i = 1; while (grid.y * i < minGridSize) { diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh index 8e4c4824c3..d4fbb039e7 100644 --- a/cpp/include/raft/distance/detail/russell_rao.cuh +++ b/cpp/include/raft/distance/detail/russell_rao.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the Russell Rao distance matrix: - * It computes the following equation: + * It computes the following equation: Cij = (k - sum(x_i * y_i)) / k * * @tparam DataT input data-type (for A and B matrices) @@ -47,29 +47,42 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void russellRaoImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; const float one_over_k = 1.0 / k; // epilogue operation lambda for final value calculation auto epilog_lambda = [k, one_over_k] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -81,46 +94,65 @@ static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; if (isRowMajor) { - constexpr auto russellRaoRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - russellRaoRowMajor); + constexpr auto russellRaoRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoRowMajor); russellRaoRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - constexpr auto russellRaoColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - russellRaoColMajor); + constexpr auto russellRaoColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoColMajor); russellRaoColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void russellRao(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + russellRaoImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + russellRaoImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { russellRaoImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -129,7 +161,7 @@ void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Russell Rao distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = (k - sum(x_i * y_i)) / k * * @tparam InType input data-type (for A and B matrices) @@ -147,16 +179,25 @@ void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void russellRaoImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void russellRaoImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - russellRaoOutType; + typedef typename std::conditional::type russellRaoOutType; Index_ lda, ldb, ldd; - russellRaoOutType *pDcast = reinterpret_cast(pD); + russellRaoOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; russellRao( diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 8b55543ff8..66832c12d2 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -25,132 +25,163 @@ namespace raft { namespace distance { /** -* @brief Evaluate pairwise distances with the user epilogue lamba allowed -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam FinalLambda user-defined epilogue lamba -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param dist output distance matrix -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* @param workspace temporary workspace needed for computations -* @param worksize number of bytes of the workspace -* @param fin_op the final gemm epilogue lambda -* @param stream cuda stream -* @param isRowMajor whether the matrices are row-major or col-major -* @param metric_arg metric argument (used for Minkowski distance) -* -* @note fin_op: This is a device lambda which is supposed to operate upon the -* input which is AccType and returns the output in OutType. It's signature is -* as follows:

OutType fin_op(AccType in, int g_idx);
. If one needs -* any other parameters, feel free to pass them via closure. -*/ -template OutType fin_op(AccType in, int g_idx);. If one needs + * any other parameters, feel free to pass them via closure. + */ +template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ detail::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); } /** -* @brief Evaluate pairwise distances for the simple use case -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param dist output distance matrix -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* @param workspace temporary workspace needed for computations -* @param worksize number of bytes of the workspace -* @param stream cuda stream -* @param isRowMajor whether the matrices are row-major or col-major -* @param metric_arg metric argument (used for Minkowski distance) -* -* @note if workspace is passed as nullptr, this will return in -* worksize, the number of bytes of workspace required -*/ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ detail::distance( x, y, dist, m, n, k, workspace, worksize, stream, isRowMajor, metric_arg); } /** -* @brief Return the exact workspace size to compute the distance -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* -* @note If the specified distanceType doesn't need the workspace at all, it -* returns 0. -*/ -template -size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, - Index_ k) { - return detail::getWorkspaceSize(x, y, m, n, k); + * @brief Return the exact workspace size to compute the distance + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * + * @note If the specified distanceType doesn't need the workspace at all, it + * returns 0. + */ +template +size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) +{ + return detail::getWorkspaceSize(x, y, m, n, k); } /** -* @brief Evaluate pairwise distances for the simple use case -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param dist output distance matrix -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* @param stream cuda stream -* @param isRowMajor whether the matrices are row-major or col-major -* @param metric_arg metric argument (used for Minkowski distance) -* -* @note if workspace is passed as nullptr, this will return in -* worksize, the number of bytes of workspace required -*/ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ rmm::device_uvector workspace(0, stream); - auto worksize = - getWorkspaceSize(x, y, m, n, - k); + auto worksize = getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); detail::distance( - x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); } /** @@ -173,119 +204,117 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @param isRowMajor whether the matrices are row-major or col-major */ template -void pairwise_distance(const raft::handle_t &handle, const Type *x, - const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, - rmm::device_uvector &workspace, +void pairwise_distance(const raft::handle_t& handle, + const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + rmm::device_uvector& workspace, raft::distance::DistanceType metric, - bool isRowMajor = true, Type metric_arg = 2.0f) { + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L2SqrtExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::L2SqrtExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::CosineExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L1: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L2Unexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::L2Unexpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::L2SqrtUnexpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::Linf: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::HellingerExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::LpUnexpanded>( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, - metric_arg); + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, metric_arg); break; case raft::distance::DistanceType::Canberra: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::HammingUnexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::HammingUnexpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::JensenShannon: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::JensenShannon>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::RusselRaoExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::RusselRaoExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::KLDivergence: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::KLDivergence>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::CorrelationExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::CorrelationExpanded>( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + detail:: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; - default: - THROW("Unknown or unsupported distance metric '%d'!", (int)metric); + default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } /** @} */ /** - * @defgroup pairwise_distance pairwise distance prims - * @{ - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param metric distance metric - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - */ + * @defgroup pairwise_distance pairwise distance prims + * @{ + * @brief Convenience wrapper around 'distance' prim to convert runtime metric + * into compile time for the purpose of dispatch + * @tparam Type input/accumulation/output data-type + * @tparam Index_ indexing type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param metric distance metric + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + */ template -void pairwise_distance(const raft::handle_t &handle, const Type *x, - const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, +void pairwise_distance(const raft::handle_t& handle, + const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, raft::distance::DistanceType metric, - bool isRowMajor = true, Type metric_arg = 2.0f) { + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ rmm::device_uvector workspace(0, handle.get_stream()); - pairwise_distance(handle, x, y, dist, m, n, k, workspace, - metric, isRowMajor, metric_arg); + pairwise_distance( + handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg); } }; // namespace distance diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp index 0a730506c8..d924ef217c 100644 --- a/cpp/include/raft/distance/fused_l2_nn.hpp +++ b/cpp/include/raft/distance/fused_l2_nn.hpp @@ -30,8 +30,7 @@ template using KVPMinReduce = detail::KVPMinReduceImpl; template -using MinAndDistanceReduceOp = - detail::MinAndDistanceReduceOpImpl; +using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl; template using MinReduceOp = detail::MinReduceOpImpl; @@ -40,10 +39,9 @@ using MinReduceOp = detail::MinReduceOpImpl; * Initialize array using init value from reduction op */ template -void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, - ReduceOpT redOp) { - detail::initialize(min, m, maxVal, redOp, - handle.get_stream()); +void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ + detail::initialize(min, m, maxVal, redOp, handle.get_stream()); } /** @@ -82,25 +80,32 @@ void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, * main kernel launch * @param[in] stream cuda stream */ -template -void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NN(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ size_t bytes = sizeof(DataT) * k; if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else { detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } } diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index c62f2e5f79..773b83ab13 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -31,14 +31,14 @@ class exception : public std::exception { explicit exception() noexcept : std::exception(), msg_() {} /** copy ctor */ - exception(exception const& src) noexcept - : std::exception(), msg_(src.what()) { + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { collect_call_stack(); } /** ctor from an input message */ - explicit exception(std::string const msg) noexcept - : std::exception(), msg_(std::move(msg)) { + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { collect_call_stack(); } @@ -51,7 +51,8 @@ class exception : public std::exception { /** append call stack info to this exception's message for ease of debug */ // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept { + void collect_call_stack() noexcept + { #ifdef __GNUC__ constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT @@ -90,16 +91,16 @@ struct logic_error : public raft::exception { // FIXME: Need to be replaced with RAFT_FAIL /** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; /* NOLINT */ \ - std::snprintf(errMsg, sizeof(errMsg), \ - "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw raft::exception(msg); \ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf( \ + errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ } while (0) // FIXME: Need to be replaced with RAFT_EXPECTS @@ -109,16 +110,15 @@ struct logic_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ - do { \ - char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ - __LINE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ - msg += err_msg; \ +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ + do { \ + char err_msg[2048]; /* NOLINT */ \ + std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ + msg += err_msg; \ } while (0) /** diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 794951ca9c..70fff1e210 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -61,34 +61,30 @@ class handle_t { int cur_dev = -1; CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; - }()) { - if (n_streams != 0) { - streams_ = std::make_unique(n_streams); - } + }()) + { + if (n_streams != 0) { streams_ = std::make_unique(n_streams); } create_resources(); thrust_policy_ = std::make_unique(user_stream_); } /** - * @brief Construct a light handle copy from another + * @brief Construct a light handle copy from another * user stream, cuda handles, comms and worker pool are not copied - * The user_stream of the returned handle is set to the specified stream + * The user_stream of the returned handle is set to the specified stream * of the other handle worker pool * @param[in] other other handle for which to use streams - * @param[in] stream_id stream id in `other` worker streams + * @param[in] stream_id stream id in `other` worker streams * to be set as user stream in the constructed handle * @param[in] n_streams number worker streams to be created */ - handle_t(const handle_t& other, int stream_id, - int n_streams = kNumDefaultWorkerStreams) - : dev_id_(other.get_device()) { - RAFT_EXPECTS( - other.get_num_internal_streams() > 0, - "ERROR: the main handle must have at least one worker stream\n"); - if (n_streams != 0) { - streams_ = std::make_unique(n_streams); - } - prop_ = other.get_device_properties(); + handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) + : dev_id_(other.get_device()) + { + RAFT_EXPECTS(other.get_num_internal_streams() > 0, + "ERROR: the main handle must have at least one worker stream\n"); + if (n_streams != 0) { streams_ = std::make_unique(n_streams); } + prop_ = other.get_device_properties(); device_prop_initialized_ = true; create_resources(); set_stream(other.get_internal_stream(stream_id)); @@ -102,11 +98,10 @@ class handle_t { void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } - rmm::cuda_stream_view get_stream_view() const { - return rmm::cuda_stream_view(user_stream_); - } + rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); } - cublasHandle_t get_cublas_handle() const { + cublasHandle_t get_cublas_handle() const + { std::lock_guard _(mutex_); if (!cublas_initialized_) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); @@ -115,7 +110,8 @@ class handle_t { return cublas_handle_; } - cusolverDnHandle_t get_cusolver_dn_handle() const { + cusolverDnHandle_t get_cusolver_dn_handle() const + { std::lock_guard _(mutex_); if (!cusolver_dn_initialized_) { CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); @@ -124,7 +120,8 @@ class handle_t { return cusolver_dn_handle_; } - cusolverSpHandle_t get_cusolver_sp_handle() const { + cusolverSpHandle_t get_cusolver_sp_handle() const + { std::lock_guard _(mutex_); if (!cusolver_sp_initialized_) { CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); @@ -133,7 +130,8 @@ class handle_t { return cusolver_sp_handle_; } - cusparseHandle_t get_cusparse_handle() const { + cusparseHandle_t get_cusparse_handle() const + { std::lock_guard _(mutex_); if (!cusparse_initialized_) { CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); @@ -145,25 +143,27 @@ class handle_t { rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } // legacy compatibility for cuML - cudaStream_t get_internal_stream(int sid) const { - RAFT_EXPECTS( - streams_.get() != nullptr, - "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + cudaStream_t get_internal_stream(int sid) const + { + RAFT_EXPECTS(streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); return streams_->get_stream(sid).value(); } // new accessor return rmm::cuda_stream_view - rmm::cuda_stream_view get_internal_stream_view(int sid) const { - RAFT_EXPECTS( - streams_.get() != nullptr, - "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + rmm::cuda_stream_view get_internal_stream_view(int sid) const + { + RAFT_EXPECTS(streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); return streams_->get_stream(sid); } - int get_num_internal_streams() const { + int get_num_internal_streams() const + { return streams_.get() != nullptr ? streams_->get_pool_size() : 0; } - std::vector get_internal_streams() const { + std::vector get_internal_streams() const + { std::vector int_streams_vec; for (int i = 0; i < get_num_internal_streams(); i++) { int_streams_vec.push_back(get_internal_stream(i)); @@ -171,49 +171,51 @@ class handle_t { return int_streams_vec; } - void wait_on_user_stream() const { + void wait_on_user_stream() const + { CUDA_CHECK(cudaEventRecord(event_, user_stream_)); for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); } } - void wait_on_internal_streams() const { + void wait_on_internal_streams() const + { for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i))); CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); } } - void set_comms(std::shared_ptr communicator) { - communicator_ = communicator; - } + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - const comms::comms_t& get_comms() const { - RAFT_EXPECTS(this->comms_initialized(), - "ERROR: Communicator was not initialized\n"); + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); return *communicator_; } - void set_subcomm(std::string key, std::shared_ptr subcomm) { + void set_subcomm(std::string key, std::shared_ptr subcomm) + { subcomms_[key] = subcomm; } - const comms::comms_t& get_subcomm(std::string key) const { - RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(), - "%s was not found in subcommunicators.", key.c_str()); + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); auto subcomm = subcomms_.at(key); - RAFT_EXPECTS(nullptr != subcomm.get(), - "ERROR: Subcommunicator was not initialized"); + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); return *subcomm; } bool comms_initialized() const { return (nullptr != communicator_.get()); } - const cudaDeviceProp& get_device_properties() const { + const cudaDeviceProp& get_device_properties() const + { std::lock_guard _(mutex_); if (!device_prop_initialized_) { CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); @@ -243,29 +245,28 @@ class handle_t { mutable bool device_prop_initialized_{false}; mutable std::mutex mutex_; - void create_resources() { - CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } + void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } - void destroy_resources() { + void destroy_resources() + { ///@todo: enable *_NO_THROW variants once we have enabled logging if (cusparse_initialized_) { - //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); + // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); } if (cusolver_dn_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); } if (cusolver_sp_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); } if (cublas_initialized_) { - //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); + // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); + // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); } }; // class handle_t @@ -275,7 +276,8 @@ class handle_t { */ class stream_syncer { public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { + explicit stream_syncer(const handle_t& handle) : handle_(handle) + { handle_.wait_on_user_stream(); } ~stream_syncer() { handle_.wait_on_internal_streams(); } diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h index a7cfb9287b..5fc56de14b 100644 --- a/cpp/include/raft/integer_utils.h +++ b/cpp/include/raft/integer_utils.h @@ -34,15 +34,13 @@ namespace raft { * `modulus` is positive. */ template -inline S round_up_safe(S number_to_round, S modulus) { +inline S round_up_safe(S number_to_round, S modulus) +{ auto remainder = number_to_round % modulus; - if (remainder == 0) { - return number_to_round; - } + if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument( - "Attempt to round up beyond the type's maximum value"); + throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); } return rounded_up; } @@ -53,8 +51,9 @@ inline S round_up_safe(S number_to_round, S modulus) { * `modulus` is positive. */ template -inline S round_down_safe(S number_to_round, S modulus) { - auto remainder = number_to_round % modulus; +inline S round_down_safe(S number_to_round, S modulus) +{ + auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } @@ -72,25 +71,28 @@ inline S round_down_safe(S number_to_round, S modulus) { * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, - const T& divisor) noexcept { +constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +{ return (dividend + divisor - 1) / divisor; } namespace detail { template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { + I dividend, + I divisor) noexcept +{ // TODO: This could probably be implemented faster - return (dividend > divisor) - ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) - : (dividend > 0); + return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) + : (dividend > 0); } template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { - auto quotient = dividend / divisor; + I dividend, + I divisor) noexcept +{ + auto quotient = dividend / divisor; auto remainder = dividend % divisor; return quotient + (remainder != 0); } @@ -110,16 +112,17 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline std::enable_if_t::value, I> -div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = - std::integral_constant::value>; +constexpr inline std::enable_if_t::value, I> div_rounding_up_safe( + I dividend, I divisor) noexcept +{ + using i_is_a_signed_type = std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline std::enable_if_t::value, bool> -is_a_power_of_two(I val) noexcept { +constexpr inline std::enable_if_t::value, bool> is_a_power_of_two( + I val) noexcept +{ return ((val - 1) & val) == 0; } @@ -147,14 +150,14 @@ is_a_power_of_two(I val) noexcept { * @return Absolute value if value type is signed. */ template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return std::abs(value); } // Unsigned type just returns itself. template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return value; } diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index b2302836bc..a2e29952d7 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -42,26 +42,25 @@ namespace label { * \param [in] stream cuda stream */ template -int getUniquelabels(rmm::device_uvector &unique, value_t *y, size_t n, - cudaStream_t stream) { +int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) +{ rmm::device_scalar d_num_selected(stream); rmm::device_uvector workspace(n, stream); - size_t bytes = 0; + size_t bytes = 0; size_t bytes2 = 0; // Query how much temporary storage we will need for cub operations // and allocate it cub::DeviceRadixSort::SortKeys(NULL, bytes, y, workspace.data(), n); - cub::DeviceSelect::Unique(NULL, bytes2, workspace.data(), workspace.data(), - d_num_selected.data(), n); + cub::DeviceSelect::Unique( + NULL, bytes2, workspace.data(), workspace.data(), d_num_selected.data(), n); bytes = max(bytes, bytes2); rmm::device_uvector cub_storage(bytes, stream); // Select Unique classes - cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(), - n); - cub::DeviceSelect::Unique(cub_storage.data(), bytes, workspace.data(), - workspace.data(), d_num_selected.data(), n); + cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(), n); + cub::DeviceSelect::Unique( + cub_storage.data(), bytes, workspace.data(), workspace.data(), d_num_selected.data(), n); int n_unique = d_num_selected.value(stream); // Copy unique classes to output @@ -90,16 +89,17 @@ int getUniquelabels(rmm::device_uvector &unique, value_t *y, size_t n, * \param [in] stream cuda stream */ template -void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, - value_t *y_out, int idx, cudaStream_t stream) { +void getOvrlabels( + value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) +{ ASSERT(idx < n_classes, "Parameter idx should not be larger than the number " "of classes"); raft::linalg::unaryOp( - y_out, y, n, - [idx, y_unique] __device__(value_t y) { - return y == y_unique[idx] ? +1 : -1; - }, + y_out, + y, + n, + [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; }, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -108,9 +108,14 @@ void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, - Type *out, size_t N, Lambda filter_op, - bool zero_based = false) { +__global__ void map_label_kernel(Type* map_ids, + size_t N_labels, + Type* in, + Type* out, + size_t N, + Lambda filter_op, + bool zero_based = false) +{ int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (!filter_op(in[tid])) { @@ -125,27 +130,28 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out the output monotonic array - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - * @param filter_op an optional function for specifying which values - * should have monotonically increasing labels applied to them. - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out the output monotonic array + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - Lambda filter_op, bool zero_based = false) { +void make_monotonic( + Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) +{ static const size_t TPB_X = 256; dim3 blocks(raft::ceildiv(N, TPB_X)); @@ -159,25 +165,25 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out output label array with labels assigned monotonically - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out output label array with labels assigned monotonically + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - bool zero_based = false) { +void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) +{ make_monotonic( out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index bed74581a2..1ee0659b0d 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -35,8 +35,10 @@ __global__ void __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, value_idx* __restrict__ R, - const bool* __restrict__ mask, bool* __restrict__ m, - value_idx N) { + const bool* __restrict__ mask, + bool* __restrict__ m, + value_idx N) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (__ldg((char*)mask + tid)) { @@ -65,15 +67,17 @@ template __global__ void __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, value_idx N, - value_idx MAX_LABEL) { + const value_idx* __restrict__ R, + value_idx N, + value_idx MAX_LABEL) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { // Note: labels are from 1 to N - value_idx la = labels_a[tid]; - value_idx lb = __ldg(labels_b + tid); - value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; - value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; + value_idx la = labels_a[tid]; + value_idx lb = __ldg(labels_b + tid); + value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; + value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; labels_a[tid] = min(ra, rb); } } @@ -108,9 +112,14 @@ __global__ void __launch_bounds__(TPB_X) * @param[in] stream CUDA stream */ template -void merge_labels(value_idx* labels_a, const value_idx* labels_b, - const bool* mask, value_idx* R, bool* m, value_idx N, - cudaStream_t stream) { +void merge_labels(value_idx* labels_a, + const value_idx* labels_b, + const bool* mask, + value_idx* R, + bool* m, + value_idx N, + cudaStream_t stream) +{ dim3 blocks(raft::ceildiv(N, value_idx(TPB_X))); dim3 threads(TPB_X); value_idx MAX_LABEL = std::numeric_limits::max(); diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h index ed545b7198..e488dc528f 100644 --- a/cpp/include/raft/lap/d_structs.h +++ b/cpp/include/raft/lap/d_structs.h @@ -26,18 +26,18 @@ template struct Vertices { - vertex_t *row_assignments; - vertex_t *col_assignments; - int *row_covers; - int *col_covers; - weight_t *row_duals; - weight_t *col_duals; - weight_t *col_slacks; + vertex_t* row_assignments; + vertex_t* col_assignments; + int* row_covers; + int* col_covers; + weight_t* row_duals; + weight_t* col_duals; + weight_t* col_slacks; }; template struct VertexData { - vertex_t *parents; - vertex_t *children; - int *is_visited; + vertex_t* parents; + vertex_t* children; + int* is_visited; }; diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index f64afb3549..42b898ebff 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -39,12 +39,12 @@ class LinearAssignmentProblem { vertex_t batchsize_; weight_t epsilon_; - weight_t const *d_costs_; + weight_t const* d_costs_; Vertices d_vertices_dev; VertexData d_row_data_dev, d_col_data_dev; - raft::handle_t const &handle_; + raft::handle_t const& handle_; rmm::device_uvector row_covers_v; rmm::device_uvector col_covers_v; rmm::device_uvector row_duals_v; @@ -60,8 +60,10 @@ class LinearAssignmentProblem { rmm::device_uvector obj_val_dual_v; public: - LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, - vertex_t batchsize, weight_t epsilon) + LinearAssignmentProblem(raft::handle_t const& handle, + vertex_t size, + vertex_t batchsize, + weight_t epsilon) : handle_(handle), size_(size), batchsize_(batchsize), @@ -79,11 +81,13 @@ class LinearAssignmentProblem { row_children_v(0, handle_.get_stream()), col_children_v(0, handle_.get_stream()), obj_val_primal_v(0, handle_.get_stream()), - obj_val_dual_v(0, handle_.get_stream()) {} + obj_val_dual_v(0, handle_.get_stream()) + { + } // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, - vertex_t *d_col_assignment) { + void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) + { initializeDevice(); d_vertices_dev.row_assignments = d_row_assignment; @@ -95,27 +99,13 @@ class LinearAssignmentProblem { while (step != 100) { switch (step) { - case 0: - step = hungarianStep0(); - break; - case 1: - step = hungarianStep1(); - break; - case 2: - step = hungarianStep2(); - break; - case 3: - step = hungarianStep3(); - break; - case 4: - step = hungarianStep4(); - break; - case 5: - step = hungarianStep5(); - break; - case 6: - step = hungarianStep6(); - break; + case 0: step = hungarianStep0(); break; + case 1: step = hungarianStep1(); break; + case 2: step = hungarianStep2(); break; + case 3: step = hungarianStep3(); break; + case 4: step = hungarianStep4(); break; + case 5: step = hungarianStep5(); break; + case 6: step = hungarianStep6(); break; } } @@ -123,36 +113,39 @@ class LinearAssignmentProblem { } // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const { + std::pair getRowDualVector(int spId) const + { return std::make_pair(row_duals_v.data() + spId * size_, size_); } // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) { + std::pair getColDualVector(int spId) + { return std::make_pair(col_duals_v.data() + spId * size_, size_); } // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) { + weight_t getPrimalObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } // Function for getting optimal dual objective value for subproblem spId. - weight_t getDualObjectiveValue(int spId) { + weight_t getDualObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } private: // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() { + void initializeDevice() + { cudaStream_t stream = handle_.get_stream(); row_covers_v.resize(batchsize_ * size_, stream); col_covers_v.resize(batchsize_ * size_, stream); @@ -171,39 +164,36 @@ class LinearAssignmentProblem { d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); + d_vertices_dev.row_duals = row_duals_v.data(); + d_vertices_dev.col_duals = col_duals_v.data(); d_vertices_dev.col_slacks = col_slacks_v.data(); d_row_data_dev.is_visited = row_is_visited_v.data(); d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), - int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), - int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), - weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), - weight_t{0}); + d_row_data_dev.parents = row_parents_v.data(); + d_row_data_dev.children = row_children_v.data(); + d_col_data_dev.parents = col_parents_v.data(); + d_col_data_dev.children = col_children_v.data(); + + thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); + thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); + thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); + thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, - size_); + int hungarianStep0() + { + detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); return 1; } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() { - detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev, - batchsize_, size_, epsilon_); + int hungarianStep1() + { + detail::computeInitialAssignments( + handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); int next = 2; @@ -219,10 +209,10 @@ class LinearAssignmentProblem { } // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() { - int cover_count = - detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + int hungarianStep2() + { + int cover_count = detail::computeRowCovers( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); int next = (cover_count == batchsize_ * size_) ? 6 : 3; @@ -230,7 +220,8 @@ class LinearAssignmentProblem { } // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() { + int hungarianStep3() + { int next; rmm::device_scalar flag_v(handle_.get_stream()); @@ -238,8 +229,14 @@ class LinearAssignmentProblem { bool h_flag = false; flag_v.set_value_async(h_flag, handle_.get_stream()); - detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, flag_v.data(), batchsize_, size_, + detail::executeZeroCover(handle_, + d_costs_, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + flag_v.data(), + batchsize_, + size_, epsilon_); h_flag = flag_v.value(handle_.get_stream()); @@ -250,31 +247,36 @@ class LinearAssignmentProblem { } // Function for augmenting the solution along multiple node-disjoint alternating trees. - int hungarianStep4() { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, - size_); + int hungarianStep4() + { + detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); - detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + detail::augmentationPass( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); return 2; } // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() { - detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, - batchsize_, size_, epsilon_); + int hungarianStep5() + { + detail::dualUpdate( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); return 3; } // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() { - detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_, - d_vertices_dev.row_assignments, batchsize_, size_); + int hungarianStep6() + { + detail::calcObjValPrimal(handle_, + obj_val_primal_v.data(), + d_costs_, + d_vertices_dev.row_assignments, + batchsize_, + size_); - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, - batchsize_, size_); + detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); return 100; } diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 830940f0ec..ab4aa2df59 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -45,20 +45,26 @@ const int BLOCKDIMX{64}; const int BLOCKDIMY{1}; // Function for calculating grid and block dimensions from the given input size. -inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateLinearDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX * BLOCKDIMY; int value = size / threads_per_block.x; if (size % threads_per_block.x > 0) value++; - total_blocks = value; + total_blocks = value; blocks_per_grid.x = value; } // Function for calculating grid and block dimensions from the given input size for square grid. -inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateSquareDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -67,15 +73,16 @@ inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX); int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY); - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } -// Function for calculating grid and block dimensions from the given input size for rectangular grid. -inline void calculateRectangularDims(dim3 &blocks_per_grid, - dim3 &threads_per_block, int &total_blocks, - int xsize, int ysize) { +// Function for calculating grid and block dimensions from the given input size for rectangular +// grid. +inline void calculateRectangularDims( + dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -85,16 +92,18 @@ inline void calculateRectangularDims(dim3 &blocks_per_grid, int valuey = ysize / threads_per_block.y; if (ysize % threads_per_block.y > 0) valuey++; - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } template -inline void initialReduction(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices_dev, - int SP, vertex_t N) { +inline void initialReduction(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices_dev, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -102,24 +111,28 @@ inline void initialReduction(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_rowReduction<<>>( - d_costs, d_vertices_dev.row_duals, SP, N, - std::numeric_limits::max()); + kernel_rowReduction<<>>( + d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); - kernel_columnReduction<<>>( - d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N, + kernel_columnReduction<<>>( + d_costs, + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + SP, + N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); } template -inline void computeInitialAssignments(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices, - int SP, vertex_t N, weight_t epsilon) { +inline void computeInitialAssignments(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -137,21 +150,29 @@ inline void computeInitialAssignments(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeInitialAssignments<<>>( - d_costs, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(), - col_lock_v.data(), SP, N, epsilon); + kernel_computeInitialAssignments<<>>( + d_costs, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.row_assignments, + d_vertices.col_assignments, + row_lock_v.data(), + col_lock_v.data(), + SP, + N, + epsilon); CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. template -inline int computeRowCovers(raft::handle_t const &handle, - Vertices &d_vertices, - VertexData &d_row_data, - VertexData &d_col_data, int SP, - vertex_t N) { +inline int computeRowCovers(raft::handle_t const& handle, + Vertices& d_vertices, + VertexData& d_row_data, + VertexData& d_col_data, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -160,8 +181,7 @@ inline int computeRowCovers(raft::handle_t const &handle, thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0}); thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0}); - thrust::fill_n(thrust::device, d_vertices.col_slacks, size, - std::numeric_limits::max()); + thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits::max()); thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1}); @@ -171,25 +191,28 @@ inline int computeRowCovers(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeRowCovers<<>>( - d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, - SP, N); + kernel_computeRowCovers<<>>( + d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); - return thrust::reduce(thrust::device, d_vertices.row_covers, - d_vertices.row_covers + size); + return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); } // Function for covering the zeros in uncovered rows and expanding the frontier. template -inline void coverZeroAndExpand( - raft::handle_t const &handle, weight_t const *d_costs_dev, - vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, weight_t epsilon) { +inline void coverZeroAndExpand(raft::handle_t const& handle, + weight_t const* d_costs_dev, + vertex_t const* d_rows_csr_neighbors, + vertex_t const* d_rows_csr_ptrs, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; @@ -197,20 +220,30 @@ inline void coverZeroAndExpand( raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_coverAndExpand<<>>( - d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev, - d_row_data_dev, d_col_data_dev, SP, N, epsilon); + kernel_coverAndExpand<<>>( + d_flag, + d_rows_csr_ptrs, + d_rows_csr_neighbors, + d_costs_dev, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + SP, + N, + epsilon); } template -inline vertex_t zeroCoverIteration(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, - weight_t epsilon) { +inline vertex_t zeroCoverIteration(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M; rmm::device_uvector csr_ptrs_v(0, handle.get_stream()); @@ -235,65 +268,85 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, blocks_per_grid, threads_per_block, total_blocks, N, SP); // construct predicate matrix for edges. - kernel_rowPredicateConstructionCSR<<>>( - predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, - N); + predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); - thrust::exclusive_scan(thrust::device, addresses_v.begin(), - addresses_v.end(), addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (M > 0) { csr_neighbors_v.resize(M, handle.get_stream()); - kernel_rowScatterCSR<<>>( - predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(), - csr_ptrs_v.data(), M, SP, N); + kernel_rowScatterCSR<<>>( + predicates_v.data(), + addresses_v.data(), + csr_neighbors_v.data(), + csr_ptrs_v.data(), + M, + SP, + N); CHECK_CUDA(handle.get_stream()); } } if (M > 0) { - coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(), - csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + coverZeroAndExpand(handle, + d_costs_dev, + csr_neighbors_v.data(), + csr_ptrs_v.data(), + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + d_flag, + SP, + N, + epsilon); } return M; } -// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros. +// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending +// on the presence of uncovered zeros. template -inline void executeZeroCover(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, bool *d_flag, - int SP, vertex_t N, weight_t epsilon) { +inline void executeZeroCover(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M = 1; while (M > 0) { - M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + M = zeroCoverIteration( + handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon); } } // Function for executing reverse pass of the maximum matching. template -inline void reversePass(raft::handle_t const &handle, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, int N) { +inline void reversePass(raft::handle_t const& handle, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; std::size_t size = SP * N; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, size); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); rmm::device_uvector predicates_v(size, handle.get_stream()); rmm::device_uvector addresses_v(size, handle.get_stream()); @@ -302,18 +355,19 @@ inline void reversePass(raft::handle_t const &handle, thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. - std::size_t csr_size = - thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); + std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (csr_size > 0) { int total_blocks_1 = 0; @@ -324,14 +378,12 @@ inline void reversePass(raft::handle_t const &handle, rmm::device_uvector elements_v(csr_size, handle.get_stream()); - kernel_augmentScatter<<>>( + kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); CHECK_CUDA(handle.get_stream()); - kernel_reverseTraversal<<>>( + kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); CHECK_CUDA(handle.get_stream()); } @@ -339,16 +391,17 @@ inline void reversePass(raft::handle_t const &handle, // Function for executing augmentation pass of the maximum matching. template -inline void augmentationPass(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, - int N) { +inline void augmentationPass(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP * N); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); rmm::device_uvector predicates_v(SP * N, handle.get_stream()); rmm::device_uvector addresses_v(SP * N, handle.get_stream()); @@ -357,7 +410,9 @@ inline void augmentationPass(raft::handle_t const &handle, thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); @@ -368,8 +423,8 @@ inline void augmentationPass(raft::handle_t const &handle, vertex_t row_ids_csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (row_ids_csr_size > 0) { int total_blocks_1 = 0; @@ -378,20 +433,20 @@ inline void augmentationPass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size); - rmm::device_uvector elements_v(row_ids_csr_size, - handle.get_stream()); + rmm::device_uvector elements_v(row_ids_csr_size, handle.get_stream()); - kernel_augmentScatter<<>>( - elements_v.data(), predicates_v.data(), addresses_v.data(), - vertex_t{SP * N}); + kernel_augmentScatter<<>>( + elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); CHECK_CUDA(handle.get_stream()); - kernel_augmentation<<>>( - d_vertices_dev.row_assignments, d_vertices_dev.col_assignments, - elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N}, + kernel_augmentation<<>>( + d_vertices_dev.row_assignments, + d_vertices_dev.col_assignments, + elements_v.data(), + d_row_data_dev, + d_col_data_dev, + vertex_t{N}, row_ids_csr_size); CHECK_CUDA(handle.get_stream()); @@ -399,34 +454,45 @@ inline void augmentationPass(raft::handle_t const &handle, } template -inline void dualUpdate(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, vertex_t N, - weight_t epsilon) { +inline void dualUpdate(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks; rmm::device_scalar sp_min_v(handle.get_stream()); - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); - kernel_dualUpdate_1<<>>( - sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP, - N, std::numeric_limits::max()); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + kernel_dualUpdate_1<<>>( + sp_min_v.data(), + d_vertices_dev.col_slacks, + d_vertices_dev.col_covers, + SP, + N, + std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_dualUpdate_2<<>>( - sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals, - d_vertices_dev.col_slacks, d_vertices_dev.row_covers, - d_vertices_dev.col_covers, d_row_data_dev.is_visited, - d_col_data_dev.parents, SP, N, std::numeric_limits::max(), + kernel_dualUpdate_2<<>>( + sp_min_v.data(), + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + d_vertices_dev.col_slacks, + d_vertices_dev.row_covers, + d_vertices_dev.col_covers, + d_row_data_dev.is_visited, + d_col_data_dev.parents, + SP, + N, + std::numeric_limits::max(), epsilon); CHECK_CUDA(handle.get_stream()); @@ -434,18 +500,19 @@ inline void dualUpdate(raft::handle_t const &handle, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, - Vertices &d_vertices_dev, int SP, - int N) { +inline void calcObjValDual(raft::handle_t const& handle, + weight_t* d_obj_val, + Vertices& d_vertices_dev, + int SP, + int N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValDual<<>>( + kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); CHECK_CUDA(handle.get_stream()); @@ -453,20 +520,21 @@ inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val, - weight_t const *d_costs, - vertex_t const *d_row_assignments, int SP, - vertex_t N) { +inline void calcObjValPrimal(raft::handle_t const& handle, + weight_t* d_obj_val, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValPrimal<<>>(d_obj_val, d_costs, - d_row_assignments, SP, N); + kernel_calcObjValPrimal<<>>( + d_obj_val, d_costs, d_row_assignments, SP, N); CHECK_CUDA(handle.get_stream()); } diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 14ad877aa4..328cbf3e74 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -45,42 +45,57 @@ const int AUGMENT{4}; const int MODIFIED{5}; template -bool __device__ near_zero(weight_t w, weight_t epsilon) { +bool __device__ near_zero(weight_t w, weight_t epsilon) +{ return ((w > -epsilon) && (w < epsilon)); } template <> -bool __device__ near_zero(int32_t w, int32_t epsilon) { +bool __device__ near_zero(int32_t w, int32_t epsilon) +{ return (w == 0); } template <> -bool __device__ near_zero(int64_t w, int64_t epsilon) { +bool __device__ near_zero(int64_t w, int64_t epsilon) +{ return (w == 0); } -// Device function for traversing the neighbors from start pointer to end pointer and updating the covers. -// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution. +// Device function for traversing the neighbors from start pointer to end pointer and updating the +// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of +// Step 4 execution. template -__device__ void cover_and_expand_row( - weight_t const *d_elements, weight_t const *d_row_duals, - weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers, - int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag, - vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited, - int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N, - weight_t epsilon) { +__device__ void cover_and_expand_row(weight_t const* d_elements, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + weight_t* d_col_slacks, + int* d_row_covers, + int* d_col_covers, + vertex_t const* d_col_assignments, + bool* d_flag, + vertex_t* d_row_parents, + vertex_t* d_col_parents, + int* d_row_visited, + int* d_col_visited, + vertex_t rowid, + int spid, + int colid, + vertex_t N, + weight_t epsilon) +{ int ROWID = spid * N + rowid; int COLID = spid * N + colid; - weight_t slack = d_elements[spid * N * N + rowid * N + colid] - - d_row_duals[ROWID] - d_col_duals[COLID]; + weight_t slack = + d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID]; int nxt_rowid = d_col_assignments[COLID]; int NXT_ROWID = spid * N + nxt_rowid; if (rowid != nxt_rowid && d_col_covers[COLID] == 0) { if (slack < d_col_slacks[COLID]) { - d_col_slacks[COLID] = slack; + d_col_slacks[COLID] = slack; d_col_parents[COLID] = ROWID; } @@ -89,13 +104,12 @@ __device__ void cover_and_expand_row( d_row_parents[NXT_ROWID] = COLID; // update parent info d_row_covers[NXT_ROWID] = 0; - d_col_covers[COLID] = 1; + d_col_covers[COLID] = 1; - if (d_row_visited[NXT_ROWID] != VISITED) - d_row_visited[NXT_ROWID] = ACTIVE; + if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE; } else { d_col_visited[COLID] = REVERSE; - *d_flag = true; + *d_flag = true; } } } @@ -104,28 +118,34 @@ __device__ void cover_and_expand_row( // Device function for traversing an alternating path from unassigned row to unassigned column. template -__device__ void __reverse_traversal( - int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children, - vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) { +__device__ void __reverse_traversal(int* d_row_visited, + vertex_t* d_row_children, + vertex_t* d_col_children, + vertex_t const* d_row_parents, + vertex_t const* d_col_parents, + int cur_colid) +{ int cur_rowid = -1; while (cur_colid != -1) { d_col_children[cur_colid] = cur_rowid; - cur_rowid = d_col_parents[cur_colid]; + cur_rowid = d_col_parents[cur_colid]; d_row_children[cur_rowid] = cur_colid; - cur_colid = d_row_parents[cur_rowid]; + cur_colid = d_row_parents[cur_rowid]; } d_row_visited[cur_rowid] = AUGMENT; } // Device function for augmenting the alternating path from unassigned column to unassigned row. template -__device__ void __augment(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_children, - vertex_t const *d_col_children, vertex_t cur_rowid, - vertex_t N) { +__device__ void __augment(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_children, + vertex_t const* d_col_children, + vertex_t cur_rowid, + vertex_t N) +{ int cur_colid = -1; while (cur_rowid != -1) { @@ -142,20 +162,18 @@ __device__ void __augment(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_rowReduction(weight_t const *d_costs, - weight_t *d_row_duals, int SP, vertex_t N, - weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; - int rowid = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void kernel_rowReduction( + weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; + int rowid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && rowid < N) { for (int colid = 0; colid < N; colid++) { weight_t slack = d_costs[spid * N * N + rowid * N + colid]; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_row_duals[spid * N + rowid] = min; @@ -166,25 +184,26 @@ __global__ void kernel_rowReduction(weight_t const *d_costs, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_columnReduction(weight_t const *d_costs, - weight_t const *d_row_duals, - weight_t *d_col_duals, int SP, - vertex_t N, weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_columnReduction(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t* d_col_duals, + int SP, + vertex_t N, + weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && colid < N) { for (int rowid = 0; rowid < N; rowid++) { - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[spid * N + rowid]; weight_t slack = cost - row_dual; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_col_duals[spid * N + colid] = min; @@ -193,12 +212,18 @@ __global__ void kernel_columnReduction(weight_t const *d_costs, // Kernel for calculating initial assignments. template -__global__ void kernel_computeInitialAssignments( - weight_t const *d_costs, weight_t const *d_row_duals, - weight_t const *d_col_duals, vertex_t *d_row_assignments, - vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeInitialAssignments(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + int* d_row_lock, + int* d_col_lock, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && colid < N) { @@ -210,15 +235,15 @@ __global__ void kernel_computeInitialAssignments( if (d_col_lock[overall_colid] == 1) break; - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[overall_rowid]; - weight_t slack = cost - row_dual - col_dual; + weight_t slack = cost - row_dual - col_dual; if (near_zero(slack, epsilon)) { if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) { d_row_assignments[overall_rowid] = colid; d_col_assignments[overall_colid] = rowid; - d_col_lock[overall_colid] = 1; + d_col_lock[overall_colid] = 1; } } } @@ -227,10 +252,10 @@ __global__ void kernel_computeInitialAssignments( // Kernel for populating the cover arrays and initializing alternating tree. template -__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, - int *d_row_covers, int *d_row_visited, - int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeRowCovers( + vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -246,11 +271,10 @@ __global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, // Kernel for populating the predicate matrix for edges in row major format. template -__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, - vertex_t *d_addresses, - int *d_row_visited, int SP, - vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowPredicateConstructionCSR( + bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -258,130 +282,160 @@ __global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, if (d_row_visited[index] == ACTIVE) { d_predicates[index] = true; - d_addresses[index] = 1; + d_addresses[index] = 1; } else { d_predicates[index] = false; - d_addresses[index] = 0; + d_addresses[index] = 0; } } } // Kernel for scattering the edges based on the scatter addresses. template -__global__ void kernel_rowScatterCSR(bool const *d_predicates, - vertex_t const *d_addresses, - vertex_t *d_neighbors, vertex_t *d_ptrs, - vertex_t M, int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowScatterCSR(bool const* d_predicates, + vertex_t const* d_addresses, + vertex_t* d_neighbors, + vertex_t* d_ptrs, + vertex_t M, + int SP, + vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { int index = spid * N + rowid; - bool predicate = d_predicates[index]; + bool predicate = d_predicates[index]; vertex_t compid = d_addresses[index]; - if (predicate) { - d_neighbors[compid] = rowid; - } + if (predicate) { d_neighbors[compid] = rowid; } if (rowid == 0) { d_ptrs[spid] = compid; - d_ptrs[SP] = M; + d_ptrs[SP] = M; } } } // Kernel for finding the minimum zero cover. template -__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs, - vertex_t const *d_neighbors, - weight_t const *d_elements, +__global__ void kernel_coverAndExpand(bool* d_flag, + vertex_t const* d_ptrs, + vertex_t const* d_neighbors, + weight_t const* d_elements, Vertices d_vertices, VertexData d_row_data, - VertexData d_col_data, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; + VertexData d_col_data, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; // Load values into local memory if (spid < SP && colid < N) { thrust::for_each( - thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1], - [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, - epsilon] __device__(vertex_t rowid) { - cover_and_expand_row( - d_elements, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers, - d_vertices.col_assignments, d_flag, d_row_data.parents, - d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited, - rowid, spid, colid, N, epsilon); + thrust::seq, + d_neighbors + d_ptrs[spid], + d_neighbors + d_ptrs[spid + 1], + [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__( + vertex_t rowid) { + cover_and_expand_row(d_elements, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.col_slacks, + d_vertices.row_covers, + d_vertices.col_covers, + d_vertices.col_assignments, + d_flag, + d_row_data.parents, + d_col_data.parents, + d_row_data.is_visited, + d_col_data.is_visited, + rowid, + spid, + colid, + N, + epsilon); }); } } // Kernel for constructing the predicates for reverse pass or augmentation candidates. template -__global__ void kernel_augmentPredicateConstruction(bool *d_predicates, - vertex_t *d_addresses, - int *d_visited, int size) { +__global__ void kernel_augmentPredicateConstruction(bool* d_predicates, + vertex_t* d_addresses, + int* d_visited, + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { int visited = d_visited[id]; if ((visited == REVERSE) || (visited == AUGMENT)) { d_predicates[id] = true; - d_addresses[id] = 1; + d_addresses[id] = 1; } else { d_predicates[id] = false; - d_addresses[id] = 0; + d_addresses[id] = 0; } } } // Kernel for scattering the vertices based on the scatter addresses. template -__global__ void kernel_augmentScatter(vertex_t *d_elements, - bool const *d_predicates, - vertex_t const *d_addresses, - std::size_t size) { +__global__ void kernel_augmentScatter(vertex_t* d_elements, + bool const* d_predicates, + vertex_t const* d_addresses, + std::size_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - if (d_predicates[id]) { - d_elements[d_addresses[id]] = id; - } + if (d_predicates[id]) { d_elements[d_addresses[id]] = id; } } } // Kernel for executing the reverse pass of the maximum matching algorithm. template -__global__ void kernel_reverseTraversal(vertex_t *d_elements, +__global__ void kernel_reverseTraversal(vertex_t* d_elements, VertexData d_row_data, VertexData d_col_data, - int size) { + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __reverse_traversal(d_row_data.is_visited, d_row_data.children, - d_col_data.children, d_row_data.parents, - d_col_data.parents, d_elements[id]); + __reverse_traversal(d_row_data.is_visited, + d_row_data.children, + d_col_data.children, + d_row_data.parents, + d_col_data.parents, + d_elements[id]); } } // Kernel for executing the augmentation pass of the maximum matching algorithm. template -__global__ void kernel_augmentation(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_elements, +__global__ void kernel_augmentation(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_elements, VertexData d_row_data, - VertexData d_col_data, vertex_t N, - vertex_t size) { + VertexData d_col_data, + vertex_t N, + vertex_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __augment(d_row_assignments, d_col_assignments, d_row_data.children, - d_col_data.children, d_row_elements[id], N); + __augment(d_row_assignments, + d_col_assignments, + d_row_data.children, + d_col_data.children, + d_row_elements[id], + N); } } @@ -389,18 +443,21 @@ __global__ void kernel_augmentation(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_1(weight_t *d_sp_min, - weight_t const *d_col_slacks, - int const *d_col_covers, int SP, vertex_t N, - weight_t infinity) { +__global__ void kernel_dualUpdate_1(weight_t* d_sp_min, + weight_t const* d_col_slacks, + int const* d_col_covers, + int SP, + vertex_t N, + weight_t infinity) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { weight_t min = infinity; for (int colid = 0; colid < N; colid++) { - int index = spid * N + colid; + int index = spid * N + colid; weight_t slack = d_col_slacks[index]; - int col_cover = d_col_covers[index]; + int col_cover = d_col_covers[index]; if (col_cover == 0) if (slack < min) min = slack; @@ -414,21 +471,29 @@ __global__ void kernel_dualUpdate_1(weight_t *d_sp_min, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_2( - weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals, - weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers, - int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N, - weight_t infinity, weight_t epsilon) { +__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min, + weight_t* d_row_duals, + weight_t* d_col_duals, + weight_t* d_col_slacks, + int const* d_row_covers, + int const* d_col_covers, + int* d_row_visited, + vertex_t* d_col_parents, + int SP, + vertex_t N, + weight_t infinity, + weight_t epsilon) +{ int spid = blockIdx.y * blockDim.y + threadIdx.y; - int id = blockIdx.x * blockDim.x + threadIdx.x; + int id = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && id < N) { int index = spid * N + id; if (d_sp_min[spid] < infinity) { weight_t theta = d_sp_min[spid]; - int row_cover = d_row_covers[index]; - int col_cover = d_col_covers[index]; + int row_cover = d_row_covers[index]; + int col_cover = d_col_covers[index]; if (row_cover == 0) // Row vertex is reachable from source. d_row_duals[index] += theta; @@ -450,10 +515,12 @@ __global__ void kernel_dualUpdate_2( // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, - weight_t const *d_row_duals, - weight_t const *d_col_duals, int SP, - vertex_t N) { +__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { @@ -468,10 +535,12 @@ __global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal, - weight_t const *d_costs, - vertex_t const *d_row_assignments, - int SP, vertex_t N) { +__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 7a454f64e2..11d3174951 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -37,8 +37,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; unaryOp(out, in, len, op, stream); } @@ -57,23 +57,24 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void add(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { +__global__ void add_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] + *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -83,14 +84,16 @@ __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @param stream cuda stream */ template -void addDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // TODO: block dimension has not been tuned dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 940d786e87..a49a433941 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -22,10 +22,10 @@ namespace raft { namespace linalg { -template -__global__ void binaryOpKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len, Lambda op) { +template +__global__ void binaryOpKernel( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a, b; @@ -42,12 +42,11 @@ __global__ void binaryOpKernel(OutType *out, const InType *in1, c.store(out, idx); } -template -void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, - IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void binaryOpImpl( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); binaryOpKernel <<>>(out, in1, in2, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -56,8 +55,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, /** * @brief Checks if addresses are aligned on N bytes */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, - uint64_t N) { +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) +{ return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } @@ -77,38 +76,36 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val1, const InType& val2);` */ -template -void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, - Lambda op, cudaStream_t stream) { - constexpr auto maxSize = - sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 16)) { +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 8)) { + } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 4)) { + } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 2)) { + } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { binaryOpImpl( out, in1, in2, len, op, stream); } else if (1 / maxSize) { binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, - op, stream); + binaryOpImpl(out, in1, in2, len, op, stream); } } diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index d6d064c20e..4b58133ac5 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -122,9 +122,16 @@ namespace linalg { * conditioned systems. Negative values mean no regularizaton. */ template -void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, - void *workspace, int *n_bytes, cublasFillMode_t uplo, - cudaStream_t stream, math_t eps = -1) { +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ // The matrix A' is defined as: // A' = [[A_11, A_12] // [A_21, A_22]] @@ -144,18 +151,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // We need a workspace in device memory to store a scalar. Additionally, in // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. const int align = 256; - int offset = (uplo == CUBLAS_FILL_MODE_LOWER) - ? raft::alignTo(sizeof(math_t) * (n - 1), align) - : 0; + int offset = + (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; if (workspace == nullptr) { *n_bytes = offset + 1 * sizeof(math_t); return; } - math_t *s = reinterpret_cast(((char *)workspace) + offset); - math_t *L_22 = L + (n - 1) * ld + n - 1; + math_t* s = reinterpret_cast(((char*)workspace) + offset); + math_t* L_22 = L + (n - 1) * ld + n - 1; - math_t *A_new; - math_t *A_row; + math_t* A_new; + math_t* A_row; if (uplo == CUBLAS_FILL_MODE_UPPER) { // A_new is stored as the n-1 th column of L A_new = L + (n - 1) * ld; @@ -164,27 +170,36 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // as the n-th row of L. Since the matrix is column major, this is non // contiguous. We copy elements from A_row to a contiguous workspace A_new. A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_row, ld, A_new, 1, stream)); + A_new = reinterpret_cast(workspace); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); } - cublasOperation_t op = - (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm( - handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, - CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); + CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, - A_new, 1, A_new, 1, s, stream)); + CUBLAS_CHECK( + raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_new, 1, A_row, ld, stream)); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); } } else { // n == 1 case CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); @@ -202,9 +217,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // the system is very ill conditioned then the A_22 - L_12 * L_12 can be // negative, which would result L_22 = NaN. A small positive eps parameter // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { - L_22_host = eps; - } + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); raft::update_device(L_22, &L_22_host, 1, stream); } diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index ef983ff3d0..7e0744f98a 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -26,18 +26,27 @@ namespace linalg { // of the matrix, i.e. reduce along rows for row major or reduce along columns // for column major layout. Kernel does an inplace reduction adding to original // values of dots. -template -__global__ void coalescedReductionKernel(OutType *dots, const InType *data, - int D, int N, OutType init, +template +__global__ void coalescedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, MainLambda main_op, ReduceLambda reduce_op, FinalLambda final_op, - bool inplace = false) { + bool inplace = false) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; + IdxType rowStart = blockIdx.x * D; for (IdxType i = threadIdx.x; i < D; i += TPB) { IdxType idx = rowStart + i; thread_data = reduce_op(thread_data, main_op(data[idx], i)); @@ -79,33 +88,37 @@ __global__ void coalescedReductionKernel(OutType *dots, const InType *data, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType *dots, const InType *data, int D, int N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ // One block per reduction // Efficient only for large leading dimensions if (D <= 32) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 64) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 128) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index e6ff8a49ce..817bfeab5c 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -55,8 +55,7 @@ namespace linalg { * thread block. This also determines the number of threads per * thread block */ -template +template struct KernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -101,8 +100,7 @@ struct KernelPolicy { }; // struct KernelPolicy -template +template struct ColKernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -151,7 +149,8 @@ struct ColKernelPolicy { * @{ */ template -struct Policy4x4 {}; +struct Policy4x4 { +}; template struct Policy4x4 { @@ -171,7 +170,8 @@ struct Policy4x4 { * @{ */ template -struct Policy2x8 {}; +struct Policy2x8 { +}; template struct Policy2x8 { @@ -201,8 +201,7 @@ struct Policy2x8 { * @tparam Policy policy used to customize memory access behavior. * See documentation for `KernelPolicy` to know more. */ -template +template struct Contractions_NT { protected: typedef Policy P; @@ -268,8 +267,7 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, char* _smem) + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem) : m(_m), n(_n), k(_k), @@ -286,7 +284,9 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) {} + pageRd(0) + { + } /** * @brief Ctor @@ -297,8 +297,15 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) + DI Contractions_NT(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + char* _smem) : m(_m), n(_n), k(_k), @@ -312,17 +319,18 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) { + pageRd(0) + { if (isRowMajor) { xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; + x = _x + xrowid * lda; + y = _y + yrowid * ldb; } else { xrowid = IdxT(blockIdx.y) * P::Mblk; yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; + x = _x + xrowid + srowid * lda; + y = _y + yrowid + srowid * ldb; } } @@ -331,7 +339,8 @@ struct Contractions_NT { * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ - DI void ldgXY(IdxT kidx) { + DI void ldgXY(IdxT kidx) + { ldgX(kidx); ldgY(kidx); } @@ -340,7 +349,8 @@ struct Contractions_NT { * @brief Store current block of X/Y from registers to smem * @param[in] kidx current start index of k to be loaded */ - DI void stsXY() { + DI void stsXY() + { stsX(sx + pageWr * P::SmemPage); stsY(sy + pageWr * P::SmemPage); } @@ -349,13 +359,15 @@ struct Contractions_NT { * @brief Load X and Y block from shared memory to registers * @param[in] kidx k value from the current k-block to be loaded from smem */ - DI void ldsXY(int kidx) { + DI void ldsXY(int kidx) + { ldsX(kidx, sx + pageRd * P::SmemPage); ldsY(kidx, sy + pageRd * P::SmemPage); } private: - DI void ldgX(IdxT kidx) { + DI void ldgX(IdxT kidx) + { if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -372,11 +384,10 @@ struct Contractions_NT { } } else { const auto numRows = k; - auto koffset = scolid; + auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && - (srowid + kidx + i * P::LdgRowsX) < numRows) { + if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) { ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); } else { #pragma unroll @@ -388,7 +399,8 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) { + DI void ldgY(IdxT kidx) + { if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; @@ -408,8 +420,7 @@ struct Contractions_NT { auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && - (srowid + kidx + i * P::LdgRowsY) < numRows) { + if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); } else { #pragma unroll @@ -421,7 +432,8 @@ struct Contractions_NT { } } - DI void stsX(DataT* smem) { + DI void stsX(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { @@ -429,7 +441,8 @@ struct Contractions_NT { } } - DI void stsY(DataT* smem) { + DI void stsY(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { @@ -437,7 +450,8 @@ struct Contractions_NT { } } - DI void ldsX(int kidx, DataT* smem) { + DI void ldsX(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + accrowid * P::SmemStride + kidx; #pragma unroll @@ -456,7 +470,8 @@ struct Contractions_NT { } } - DI void ldsY(int kidx, DataT* smem) { + DI void ldsY(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + acccolid * P::SmemStride + kidx; #pragma unroll diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 1be14a550d..3616d54506 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -25,8 +25,7 @@ #include #define _CUBLAS_ERR_TO_STR(err) \ - case err: \ - return #err + case err: return #err namespace raft { @@ -34,15 +33,15 @@ namespace raft { * @brief Exception thrown when a cuBLAS error is encountered. */ struct cublas_error : public raft::exception { - explicit cublas_error(char const *const message) : raft::exception(message) {} - explicit cublas_error(std::string const &message) - : raft::exception(message) {} + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cublas_error_to_string(cublasStatus_t err) { +inline const char* cublas_error_to_string(cublasStatus_t err) +{ switch (err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); @@ -54,8 +53,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: - return "CUBLAS_STATUS_UNKNOWN"; + default: return "CUBLAS_STATUS_UNKNOWN"; }; } @@ -71,29 +69,34 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { * Invokes a cuBLAS runtime API function call, if the call does not return * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred */ -#define CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ +#define CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ } while (0) /** FIXME: temporary alias for cuML compatibility */ #define CUBLAS_CHECK(call) CUBLAS_TRY(call) /** check for cublas runtime API errors but do not assert */ -#define CUBLAS_CHECK_NO_THROW(call) \ - do { \ - cublasStatus_t err = call; \ - if (err != CUBLAS_STATUS_SUCCESS) { \ - CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ - raft::linalg::detail::cublas_error_to_string(err)); \ - } \ +#define CUBLAS_CHECK_NO_THROW(call) \ + do { \ + cublasStatus_t err = call; \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::linalg::detail::cublas_error_to_string(err)); \ + } \ } while (0) namespace raft { @@ -104,22 +107,39 @@ namespace linalg { * @{ */ template -cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha, - const T *x, int incx, T *y, int incy, +cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const T* alpha, + const T* x, + int incx, + T* y, + int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const float *alpha, const float *x, int incx, - float *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const float* alpha, + const float* x, + int incx, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const double *alpha, const double *x, int incx, - double *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const double* alpha, + const double* x, + int incx, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } @@ -130,21 +150,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, * @{ */ template -cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y, - int incy, cudaStream_t stream); +cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -156,20 +176,20 @@ inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, * @{ */ template -cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx, - T *y, int incy, cudaStream_t stream); +cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } @@ -180,31 +200,56 @@ inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, * @{ */ template -cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA, - int m, int n, const T *alfa, const T *A, int lda, - const T *x, int incx, const T *beta, T *y, int incy, +cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* x, + int incx, + const T* beta, + T* y, + int incy, cudaStream_t stream); template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const float *alfa, const float *A, int lda, - const float *x, int incx, const float *beta, - float *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* x, + int incx, + const float* beta, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const double *alfa, const double *A, int lda, - const double *x, int incx, const double *beta, - double *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* x, + int incx, + const double* beta, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -213,23 +258,47 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha, - const T *x, int incx, const T *y, int incy, T *A, - int lda, cudaStream_t stream); +cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const T* alpha, + const T* x, + int incx, + const T* y, + int incy, + T* A, + int lda, + cudaStream_t stream); template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const float *alpha, const float *x, int incx, - const float *y, int incy, float *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const float* alpha, + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const double *alpha, const double *x, int incx, - const double *y, int incy, double *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const double* alpha, + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -240,34 +309,62 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, * @{ */ template -cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const T *alfa, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc, +cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + int k, + const T* alfa, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const float *alfa, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const float* alfa, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const double *alfa, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const double* alfa, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -278,38 +375,93 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, template cublasStatus_t cublasgemmBatched(cublasHandle_t handle, // NOLINT cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, - const T *const Aarray[], // NOLINT - int lda, const T *const Barray[], // NOLINT - int ldb, const T *beta, - T *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream); + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray[], // NOLINT + int lda, + const T* const Barray[], // NOLINT + int ldb, + const T* beta, + T* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream); template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, - const float *const Aarray[], // NOLINT - int lda, const float *const Barray[], // NOLINT - int ldb, const float *beta, float *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray[], // NOLINT + int lda, + const float* const Barray[], // NOLINT + int ldb, + const float* beta, + float* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasSgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, - const double *const Aarray[], // NOLINT - int lda, const double *const Barray[], // NOLINT - int ldb, const double *beta, double *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray[], // NOLINT + int lda, + const double* const Barray[], // NOLINT + int ldb, + const double* beta, + double* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasDgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } /** @} */ @@ -319,36 +471,110 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT */ template cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const T *alpha, const T *const Aarray, int lda, - int64_t strideA, const T *const Barray, int ldb, int64_t strideB, - const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount, + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray, + int lda, + int64_t strideA, + const T* const Barray, + int ldb, + int64_t strideB, + const T* beta, + T* Carray, + int ldc, + int64_t strideC, + int batchCount, cudaStream_t stream); template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, const float *const Aarray, int lda, - int64_t strideA, const float *const Barray, int ldb, int64_t strideB, - const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray, + int lda, + int64_t strideA, + const float* const Barray, + int ldb, + int64_t strideB, + const float* beta, + float* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasSgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, const double *const Aarray, int lda, - int64_t strideA, const double *const Barray, int ldb, int64_t strideB, - const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray, + int lda, + int64_t strideA, + const double* const Barray, + int ldb, + int64_t strideB, + const double* beta, + double* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasDgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } /** @} */ @@ -358,51 +584,85 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT */ template -cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n, // NOLINT - T *const A[], // NOLINT - int lda, int *P, int *info, int batchSize, +cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, + int n, // NOLINT + T* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, float *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + float* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, double *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + double* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } template -cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n, // NOLINT - const T *const A[], // NOLINT - int lda, const int *P, - T *const C[], // NOLINT - int ldc, int *info, int batchSize, +cublasStatus_t cublasgetriBatched(cublasHandle_t handle, + int n, // NOLINT + const T* const A[], // NOLINT + int lda, + const int* P, + T* const C[], // NOLINT + int ldc, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const float *const A[], // NOLINT - int lda, const int *P, float *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const float* const A[], // NOLINT + int lda, + const int* P, + float* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const double *const A[], // NOLINT - int lda, const int *P, double *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const double* const A[], // NOLINT + int lda, + const int* P, + double* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -416,34 +676,57 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT template inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, T *Aarray[], // NOLINT - int lda, T *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream); + cublasOperation_t trans, + int m, + int n, + int nrhs, + T* Aarray[], // NOLINT + int lda, + T* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream); template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, float *Aarray[], // NOLINT - int lda, float *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + float* Aarray[], // NOLINT + int lda, + float* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasSgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, double *Aarray[], // NOLINT - int lda, double *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + double* Aarray[], // NOLINT + int lda, + double* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasDgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } /** @} */ @@ -453,33 +736,59 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT * @{ */ template -cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, const T *alfa, - const T *A, int lda, const T *beta, const T *B, - int ldb, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* beta, + const T* B, + int ldb, + T* C, + int ldc, + cudaStream_t stream); template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const float *alfa, const float *A, int lda, - const float *beta, const float *B, int ldb, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* beta, + const float* B, + int ldb, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const double *alfa, const double *A, int lda, - const double *beta, const double *B, int ldb, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* beta, + const double* B, + int ldb, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -488,31 +797,59 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, const T *alpha, - const T *A, int lda, const T *B, int ldb, - const T *beta, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const T* alpha, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const float *alpha, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const double *alpha, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -521,27 +858,51 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, const T *alpha, - const T *A, int lda, const T *beta, T *C, int ldc, +cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const T* alpha, + const T* A, + int lda, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const float *alpha, const float *A, int lda, - const float *beta, float *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const double *alpha, const double *A, int lda, - const double *beta, double *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -552,52 +913,77 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, * @{ */ template -cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx, - T *result, cudaStream_t stream); +cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream); template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x, - int incx, float *result, cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x, - int incx, double *result, - cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ template -cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, const T *alpha, - const T *A, int lda, T *B, int ldb, +cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const T* alpha, + const T* A, + int lda, + T* B, + int ldb, cudaStream_t stream); template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const float *alpha, const float *A, int lda, - float *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, + const float* A, + int lda, + float* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const double *alpha, const double *A, int lda, - double *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, + const double* A, + int lda, + double* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } /** @@ -605,21 +991,39 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx, - const T *y, int incy, T *result, cudaStream_t stream); +cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const T* x, + int incx, + const T* y, + int incy, + T* result, + cudaStream_t stream); template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x, - int incx, const float *y, int incy, - float *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const float* x, + int incx, + const float* y, + int incy, + float* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, - int incx, const double *y, int incy, - double *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const double* x, + int incx, + const double* y, + int incy, + double* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } @@ -639,7 +1043,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, // template<> inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } @@ -650,21 +1055,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x, - int incx, cudaStream_t stream); +cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream); template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const float *alpha, float *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const double *alpha, double *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index 6aa5e74455..85f2740647 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -24,8 +24,7 @@ #include #define _CUSOLVER_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; namespace raft { @@ -33,16 +32,15 @@ namespace raft { * @brief Exception thrown when a cuSOLVER error is encountered. */ struct cusolver_error : public raft::exception { - explicit cusolver_error(char const *const message) - : raft::exception(message) {} - explicit cusolver_error(std::string const &message) - : raft::exception(message) {} + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cusolver_error_to_string(cusolverStatus_t err) { +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ switch (err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); @@ -54,8 +52,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: - return "CUSOLVER_STATUS_UNKNOWN"; + default: return "CUSOLVER_STATUS_UNKNOWN"; }; } @@ -76,8 +73,11 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { cusolverStatus_t const status = (call); \ if (CUSOLVER_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::linalg::detail::cusolver_error_to_string(status)); \ throw raft::cusolver_error(msg); \ } \ @@ -107,42 +107,76 @@ namespace linalg { * @{ */ template -cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *Workspace, - int *devIpiv, int *devInfo, +cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* Workspace, + int* devIpiv, + int* devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -152,30 +186,49 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, int nrhs, - const T *A, int lda, const int *devIpiv, T *B, - int ldb, int *devInfo, cudaStream_t stream); + cublasOperation_t trans, + int n, + int nrhs, + const T* A, + int lda, + const int* devIpiv, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const float *A, int lda, - const int *devIpiv, float *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const float* A, + int lda, + const int* devIpiv, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const double *A, int lda, - const int *devIpiv, double *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const double* A, + int lda, + const int* devIpiv, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -185,20 +238,40 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork) +{ return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork) +{ return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } /** @} */ @@ -209,52 +282,96 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *info, syevjInfo_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* info, + syevjInfo_t params, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, float *A, int lda, float *W, float *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, double *A, int lda, double *W, double *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork, + syevjInfo_t params); template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork, - syevjInfo_t params) { - return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork, - syevjInfo_t params) { - return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } /** @} */ @@ -264,32 +381,49 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *devInfo, cudaStream_t stream); + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, float *A, - int lda, float *W, float *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, double *A, - int lda, double *W, double *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ @@ -297,57 +431,134 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT /** * @defgroup syevdx cusolver syevdx operations * @{ -*/ + */ template cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, - int il, int iu, int *h_meig, const float *W, int *lwork) { - return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + const float* W, + int* lwork) +{ + return cusolverDnSsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, const double *W, int *lwork) { - return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + const double* W, + int* lwork) +{ + return cusolverDnDsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, - int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnSsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnDsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } /** @} */ #endif @@ -358,7 +569,11 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT */ template cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int* lwork) +{ if (std::is_same, float>::value) { return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); } else { @@ -367,72 +582,194 @@ cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT } template cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork, - T *rwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* VT, + int ldvt, + T* work, + int lwork, + T* rwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, - float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* VT, + int ldvt, + float* work, + int lwork, + float* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnSgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, - double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* VT, + int ldvt, + double* work, + int lwork, + double* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnDgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv, - int *lwork, gesvdjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const T* A, + int lda, + const T* S, + const T* U, + int ldu, + const T* V, + int ldv, + int* lwork, + gesvdjInfo_t params); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const float *A, int lda, const float *S, const float *U, int ldu, - const float *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const float* A, + int lda, + const float* S, + const float* U, + int ldu, + const float* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnSgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const double *A, int lda, const double *S, const double *U, int ldu, - const double *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const double* A, + int lda, + const double* S, + const double* U, + int ldu, + const double* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnDgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork, - int *info, gesvdjInfo_t params, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* V, + int ldv, + T* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, - float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* V, + int ldv, + float* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnSgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, - double *work, int lwork, int *info, gesvdjInfo_t params, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* V, + int ldv, + double* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnDgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } /** @} */ @@ -442,43 +779,74 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT */ template cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda, - int *Lwork); + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, T *A, - int lda, T *Workspace, int Lwork, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, float *A, - int lda, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, double *A, - int lda, double *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -490,26 +858,44 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const T *A, int lda, T *B, int ldb, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + int nrhs, + const T* A, + int lda, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const float *A, int lda, float *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const float* A, + int lda, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const double *A, int lda, double *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const double* A, + int lda, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -520,38 +906,75 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT * @{ */ template -cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *TAU, T *Workspace, - int Lwork, int *devInfo, cudaStream_t stream); +cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* TAU, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *TAU, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* TAU, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *TAU, double *Workspace, - int Lwork, int *devInfo, - cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* TAU, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } /** @} */ @@ -562,38 +985,86 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau, - T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + int m, + int n, + int k, + T* A, + int lda, + const T* tau, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda, - const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + float* A, + int lda, + const float* tau, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda, - const double *tau, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + double* A, + int lda, + const double* tau, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda, - const T *TAU, int *lwork); + cusolverDnHandle_t handle, + int m, + int n, + int k, + const T* A, + int lda, + const T* TAU, + int* lwork); template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, - const float *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const float* A, + int lda, + const float* TAU, + int* lwork) +{ return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, - const double *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const double* A, + int lda, + const double* TAU, + int* lwork) +{ return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } /** @} */ @@ -604,53 +1075,114 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle, // NOLINT - cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, - const T *tau, T *C, int ldc, T *work, - int lwork, int *devInfo, cudaStream_t stream); + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + T* C, + int ldc, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, float *C, - int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + float* C, + int ldc, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, double *C, - int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + double* C, + int ldc, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc, - int *lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + const T* C, + int ldc, + int* lwork); template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, - const float *C, int ldc, int *lwork) { - return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + const float* C, + int ldc, + int* lwork) +{ + return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, - const double *C, int ldc, int *lwork) { - return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + const double* C, + int ldc, + int* lwork) +{ + return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } /** @} */ @@ -660,62 +1192,136 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes); template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpScsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpScsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpDcsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpDcsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info, - void *pBuffer, cudaStream_t stream); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* b, + T* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const float *b, float *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* b, + float* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpScsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const double *b, double *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* b, + double* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpDcsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } /** @} */ @@ -726,66 +1332,165 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT */ template cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, const T *A, int64_t lda, const T *W, - size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost, + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + const T* A, + int64_t lda, + const T* W, + size_t* workspaceInBytesOnDevice, + size_t* workspaceInBytesOnHost, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, const float *A, int64_t lda, const float *W, - size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + const float* A, + int64_t lda, + const float* W, + size_t* workspaceInBytesOnDevice, + size_t* workspaceInBytesOnHost, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd_bufferSize( - handle, params, jobz, uplo, n, CUDA_R_32F, A, lda, CUDA_R_32F, W, - CUDA_R_32F, workspaceInBytesOnDevice, workspaceInBytesOnHost); + return cusolverDnXsyevd_bufferSize(handle, + params, + jobz, + uplo, + n, + CUDA_R_32F, + A, + lda, + CUDA_R_32F, + W, + CUDA_R_32F, + workspaceInBytesOnDevice, + workspaceInBytesOnHost); } template <> inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, const double *A, int64_t lda, - const double *W, size_t *workspaceInBytesOnDevice, - size_t *workspaceInBytesOnHost, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + const double* A, + int64_t lda, + const double* W, + size_t* workspaceInBytesOnDevice, + size_t* workspaceInBytesOnHost, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd_bufferSize( - handle, params, jobz, uplo, n, CUDA_R_64F, A, lda, CUDA_R_64F, W, - CUDA_R_64F, workspaceInBytesOnDevice, workspaceInBytesOnHost); + return cusolverDnXsyevd_bufferSize(handle, + params, + jobz, + uplo, + n, + CUDA_R_64F, + A, + lda, + CUDA_R_64F, + W, + CUDA_R_64F, + workspaceInBytesOnDevice, + workspaceInBytesOnHost); } template cusolverStatus_t cusolverDnxsyevd( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, T *A, int64_t lda, T *W, T *bufferOnDevice, - size_t workspaceInBytesOnDevice, T *bufferOnHost, - size_t workspaceInBytesOnHost, int *info, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + T* A, + int64_t lda, + T* W, + T* bufferOnDevice, + size_t workspaceInBytesOnDevice, + T* bufferOnHost, + size_t workspaceInBytesOnHost, + int* info, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnxsyevd( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, float *A, int64_t lda, float *W, - float *bufferOnDevice, size_t workspaceInBytesOnDevice, float *bufferOnHost, - size_t workspaceInBytesOnHost, int *info, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + float* A, + int64_t lda, + float* W, + float* bufferOnDevice, + size_t workspaceInBytesOnDevice, + float* bufferOnHost, + size_t workspaceInBytesOnHost, + int* info, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd(handle, params, jobz, uplo, n, CUDA_R_32F, A, lda, - CUDA_R_32F, W, CUDA_R_32F, bufferOnDevice, - workspaceInBytesOnDevice, bufferOnHost, - workspaceInBytesOnHost, info); + return cusolverDnXsyevd(handle, + params, + jobz, + uplo, + n, + CUDA_R_32F, + A, + lda, + CUDA_R_32F, + W, + CUDA_R_32F, + bufferOnDevice, + workspaceInBytesOnDevice, + bufferOnHost, + workspaceInBytesOnHost, + info); } template <> inline cusolverStatus_t cusolverDnxsyevd( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, double *A, int64_t lda, double *W, - double *bufferOnDevice, size_t workspaceInBytesOnDevice, double *bufferOnHost, - size_t workspaceInBytesOnHost, int *info, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + double* A, + int64_t lda, + double* W, + double* bufferOnDevice, + size_t workspaceInBytesOnDevice, + double* bufferOnHost, + size_t workspaceInBytesOnHost, + int* info, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd(handle, params, jobz, uplo, n, CUDA_R_64F, A, lda, - CUDA_R_64F, W, CUDA_R_64F, bufferOnDevice, - workspaceInBytesOnDevice, bufferOnHost, - workspaceInBytesOnHost, info); + return cusolverDnXsyevd(handle, + params, + jobz, + uplo, + n, + CUDA_R_64F, + A, + lda, + CUDA_R_64F, + W, + CUDA_R_64F, + bufferOnDevice, + workspaceInBytesOnDevice, + bufferOnHost, + workspaceInBytesOnHost, + info); } /** @} */ #endif diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index c848ac1f4b..562a3d8991 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index e141883b6c..288d379dac 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -29,25 +29,42 @@ namespace raft { namespace linalg { template -void eigDC_legacy(const raft::handle_t &handle, const math_t *in, - std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors, - math_t *eig_vals, cudaStream_t stream) { +void eigDC_legacy(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; - CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, in, - n_cols, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + eig_vals, + &lwork)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); CUDA_CHECK(cudaGetLastError()); auto dev_info = d_dev_info.value(stream); @@ -70,9 +87,14 @@ void eigDC_legacy(const raft::handle_t &handle, const math_t *in, * @{ */ template -void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, - std::size_t n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream) { +void eigDC(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ #if CUDART_VERSION < 11010 eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); #else @@ -82,11 +104,18 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, CUSOLVER_CHECK(cusolverDnCreateParams(&dn_params)); size_t workspaceDevice = 0; - size_t workspaceHost = 0; - CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize( - cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, - static_cast(n_rows), eig_vectors, static_cast(n_cols), - eig_vals, &workspaceDevice, &workspaceHost, stream)); + size_t workspaceHost = 0; + CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize(cusolverH, + dn_params, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + static_cast(n_rows), + eig_vectors, + static_cast(n_cols), + eig_vals, + &workspaceDevice, + &workspaceHost, + stream)); rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream); rmm::device_scalar d_dev_info(stream); @@ -94,11 +123,20 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnxsyevd( - cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, - static_cast(n_rows), eig_vectors, static_cast(n_cols), - eig_vals, d_work.data(), workspaceDevice, h_work.data(), workspaceHost, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnxsyevd(cusolverH, + dn_params, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + static_cast(n_rows), + eig_vectors, + static_cast(n_cols), + eig_vals, + d_work.data(), + workspaceDevice, + h_work.data(), + workspaceHost, + d_dev_info.data(), + stream)); CUDA_CHECK(cudaGetLastError()); CUSOLVER_CHECK(cusolverDnDestroyParams(dn_params)); @@ -128,38 +166,79 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; * @{ */ template -void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, - EigVecMemUsage memUsage, cudaStream_t stream) { +void eigSelDC(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + int n_eig_vals, + math_t* eig_vectors, + math_t* eig_vals, + EigVecMemUsage memUsage, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; int h_meig; - CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + &lwork)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } else if (memUsage == COPY_INPUT) { d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), - math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, - d_work.data(), lwork, d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } CUDA_CHECK(cudaGetLastError()); @@ -170,11 +249,10 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, "This usually occurs when some of the features do not vary enough."); if (memUsage == OVERWRITE_INPUT) { - raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, - stream); + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream); } else if (memUsage == COPY_INPUT) { - raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, - n_rows, n_eig_vals, stream); + raft::matrix::truncZeroOrigin( + d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream); } } @@ -195,36 +273,54 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @{ */ template -void eigJacobi(const raft::handle_t &handle, const math_t *in, - std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors, - math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, - std::uint32_t sweeps = 15) { +void eigJacobi(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream, + math_t tol = 1.e-7, + std::uint32_t sweeps = 15) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - CUSOLVER_CHECK( - cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); + CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); int lwork; - CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, - eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + &lwork, + syevj_params)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - dev_info.data(), syevj_params, stream)); + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + dev_info.data(), + syevj_params, + stream)); int executed_sweeps; - CUSOLVER_CHECK( - cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); CUDA_CHECK(cudaGetLastError()); CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh index 1c6dee562d..097c3ac218 100644 --- a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -34,19 +34,17 @@ namespace linalg { * @{ */ template -void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream); } template -void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream); } /** @} */ @@ -62,42 +60,46 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, * @{ */ template -void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream); } template -void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream); } template -void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream); } template -void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, + out, + in1, + in2, + len, [] __device__(InType a, InType b) { if (b == InType(0.0)) return InType(0.0); diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 0a4897cc0b..d5942b7446 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -43,35 +43,53 @@ namespace linalg { * @param stream cuda stream */ template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, - math_t beta, cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - int m = n_rows_c; - int n = n_cols_c; - int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + int m = n_rows_c; + int n = n_cols_c; + int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; int lda = trans_a == CUBLAS_OP_T ? k : m; int ldb = trans_b == CUBLAS_OP_T ? n : k; int ldc = m; - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, - cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, - trans_b, alpha, beta, stream); + math_t beta = math_t(0); + gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } /** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * @brief A wrapper for CUBLS GEMM function designed for handling all possible * combinations of operand layouts. * It computes the following equation: Z = alpha . X * Y + beta . Z * @tparam T Data type of input/output matrices (float/double) @@ -90,9 +108,20 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, * @param beta scalar */ template -void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, - int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, - cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { +void gemm(const raft::handle_t& handle, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + T alpha = T(1.0), + T beta = T(0.0)) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t trans_a, trans_b; @@ -119,13 +148,13 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major // layout, trans_b needs to be CUBLAS_OP_N. trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - ldb = isYColMajor == true ? _K : _N; + ldb = isYColMajor == true ? _K : _N; - c = z; + c = z; ldc = _M; - M = _M; - N = _N; - K = _K; + M = _M; + N = _N; + K = _K; } else { // Result c is required in row major layout Thus we pick // a = y, b = x and c = a * b = y * x @@ -154,7 +183,7 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // Set leading dimension appropriately ldb = isXColMajor == true ? _M : _K; - c = z; + c = z; ldc = _N; M = _N; @@ -162,8 +191,8 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, K = _K; } // Actual cuBLAS call - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } } // end namespace linalg diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index 0be11a0301..ac0547e30a 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -26,14 +26,23 @@ namespace raft { namespace linalg { template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows, - const int n_cols, const math_t *x, const int incx, math_t *y, - const int incy, const bool trans_a, const math_t alpha, - const math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* x, + const int incx, + math_t* y, + const int incy, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, - incx, &beta, y, incy, stream)); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream)); } /** @@ -53,9 +62,17 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows, * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, - const math_t alpha, const math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } @@ -72,11 +89,17 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, - cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); + math_t beta = math_t(0); gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } @@ -102,14 +125,22 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const int lda, const math_t *x, math_t *y, - const bool trans_a, const math_t alpha, const math_t beta, - cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, - 1, &beta, y, 1, stream)); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); } /** @@ -130,11 +161,18 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, * */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const int lda, const math_t *x, math_t *y, - const bool trans_a, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); + math_t beta = math_t(0); gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); } diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index 9944685a1f..41ef4d4641 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -37,7 +37,8 @@ namespace { * \param [in] stream cuda stream */ template -void range(T *out, int start, int end, cudaStream_t stream) { +void range(T* out, int start, int end, cudaStream_t stream) +{ thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); @@ -54,7 +55,8 @@ void range(T *out, int start, int end, cudaStream_t stream) { * \param [in] stream cuda stream */ template -void range(T *out, int n, cudaStream_t stream) { +void range(T* out, int n, cudaStream_t stream) +{ range(out, 0, n, stream); } } // unnamed namespace diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index b775a1f696..39089473e3 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -16,7 +16,7 @@ #pragma once -//for cmath: +// for cmath: #define _USE_MATH_DEFINES #include @@ -40,14 +40,14 @@ using namespace linalg; namespace spectral { // curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - float *outputPtr, size_t n, - float mean, float stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ return curandGenerateNormal(generator, outputPtr, n, mean, stddev); } -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - double *outputPtr, size_t n, - double mean, double stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); } @@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, // Helper functions // ========================================================= -/** +/** * @brief Perform Lanczos iteration * Lanczos iteration is performed on a shifted matrix A+shift*I. * @tparam index_type_t the type of data used for indexing. @@ -85,25 +85,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, * @return Zero if successful. Otherwise non-zero. */ template -int performLanczosIteration( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t *iter, index_type_t maxIter, value_type_t shift, - value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev) { +int performLanczosIteration(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); @@ -117,29 +122,28 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, - stream)); + CUBLAS_CHECK(cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, - beta_host, stream)); + CUBLAS_CHECK(cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), - 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -151,65 +155,121 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, - lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), - sizeof(value_type_t), cudaMemcpyDeviceToHost, + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, n, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, - alpha_host + (*iter - 1), stream)); + CUBLAS_CHECK(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, - beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaStreamSynchronize(stream)); @@ -217,7 +277,7 @@ int performLanczosIteration( return 0; } -/** +/** * @brief Find Householder transform for 3-dimensional system * Given an input vector v=[x,y,z]', this function finds a * Householder transform P such that P*v is a multiple of @@ -235,8 +295,8 @@ int performLanczosIteration( * matrix. Matrix dimensions are 3 x 3. */ template -static void findHouseholder3(value_type_t *v, value_type_t *Pv, - value_type_t *P) { +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -246,8 +306,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, v[0] -= *Pv; // Normalize Householder vector - value_type_t normHouseholder = - std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -261,11 +320,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, // Construct Householder matrix index_type_t i, j; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; } -/** +/** * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The @@ -277,7 +338,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ template -static void applyHouseholder3(const value_type_t *v, value_type_t *A) { +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ // Loop indices index_type_t i, j; // Dot product between Householder vector and matrix row/column @@ -286,19 +348,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Pre-apply Householder transform for (j = 0; j < 4; ++j) { vDotA = 0; - for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; } // Post-apply Householder transform for (i = 0; i < 4; ++i) { vDotA = 0; - for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; } } -/** +/** * @brief Perform one step of Francis QR algorithm * Equivalent to two steps of the classical QR algorithm on a * tridiagonal matrix. @@ -319,10 +385,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { * @return Zero if successful. Otherwise non-zero. */ template -static int francisQRIteration(index_type_t n, value_type_t shift1, - value_type_t shift2, value_type_t *alpha, - value_type_t *beta, value_type_t *V, - value_type_t *work) { +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -352,30 +422,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, - householderMatrix); + findHouseholder3(householder, &temp, householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, - 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = beta[pos + 3]; @@ -385,22 +455,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, - householderMatrix); + findHouseholder3(householder, beta + pos, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), - n, householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = 0; @@ -408,37 +478,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, - householderMatrix); + findHouseholder3(householder, beta + n - 4, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, - householderMatrix); + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } -/** +/** * @brief Perform implicit restart of Lanczos algorithm * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. * @tparam index_type_t the type of data used for indexing. @@ -474,23 +543,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * @return error flag. */ template -static int lanczosRestart( - handle_t const &handle, index_type_t n, index_type_t iter, - index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, bool smallest_eig) { +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants constexpr value_type_t zero = 0; - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Loop index index_type_t i; @@ -501,12 +577,12 @@ static int lanczosRestart( index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - value_type_t *ritzVals_host = work_host + 3 * iter; + value_type_t* ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - value_type_t *shifts_host; + value_type_t* shifts_host; // Orthonormal matrix for similarity transform - value_type_t *V_dev = work_dev + n * iter; + value_type_t* V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation @@ -524,7 +600,8 @@ static int lanczosRestart( // Initialize similarity transform with identity matrix memset(V_host, 0, iter * iter * sizeof(value_type_t)); - for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1; + for (i = 0; i < iter; ++i) + V_host[IDX(i, i, iter)] = 1; // Determine interval to suppress eigenvalues if (smallest_eig) { @@ -548,49 +625,71 @@ static int lanczosRestart( // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = - cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, - beta_host, V_host, work_host)) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = - beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, - n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), 1, stream)); + CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, - &one, lanczosVecs_dev, n, V_dev, iter, &zero, - work_dev, n, stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, - beta_host + iter_new - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, - lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -601,7 +700,7 @@ static int lanczosRestart( // Eigensolver // ========================================================= -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -651,19 +750,28 @@ static int lanczosRestart( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *shift, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -683,21 +791,20 @@ int computeSmallestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -710,12 +817,11 @@ int computeSmallestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -738,10 +844,18 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -756,9 +870,17 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -775,9 +897,19 @@ int computeSmallestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -786,9 +918,17 @@ int computeSmallestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -799,39 +939,59 @@ int computeSmallestEigenvectors( } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); - CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -869,20 +1029,25 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ using namespace spectral; // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -892,8 +1057,8 @@ int computeSmallestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -901,10 +1066,23 @@ int computeSmallestEigenvectors( // Perform Lanczos method index_type_t effIter; value_type_t shift; - int status = computeSmallestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; @@ -914,7 +1092,7 @@ int computeSmallestEigenvectors( // Eigensolver // ========================================================= -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -959,19 +1137,27 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -987,8 +1173,8 @@ int computeLargestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled @@ -998,15 +1184,14 @@ int computeLargestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -1019,12 +1204,11 @@ int computeLargestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1044,13 +1228,21 @@ int computeLargestEigenvectors( CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; value_type_t shift_val = 0.0; - value_type_t *shift = &shift_val; - - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -1067,9 +1259,19 @@ int computeLargestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -1078,9 +1280,17 @@ int computeLargestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -1090,15 +1300,18 @@ int computeLargestEigenvectors( WARNING("implicitly restarted Lanczos failed to converge"); } for (int i = 0; i < restartIter; ++i) { - for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -1123,36 +1336,52 @@ int computeLargestEigenvectors( //} // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpyAsync( - eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -1190,18 +1419,23 @@ int computeLargestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1211,18 +1445,30 @@ int computeLargestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method index_type_t effIter; - int status = computeLargestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index aff08da2d3..200818fdc3 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -24,21 +24,18 @@ namespace raft { namespace linalg { -template -__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, - Args... args) { +template +__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +{ auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - out[idx] = map(in[idx], args[idx]...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -template -void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +template +void mapImpl( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); @@ -60,12 +57,14 @@ void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void map(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { - mapImpl(out, len, map, stream, in, - args...); +void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + mapImpl(out, len, map, stream, in, args...); } } // namespace linalg diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh index f2f198670a..78a7017c5c 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -24,50 +24,66 @@ namespace raft { namespace linalg { -struct sum_tag {}; +struct sum_tag { +}; template -__device__ void reduce(OutType *out, const InType acc, sum_tag) { +__device__ void reduce(OutType* out, const InType acc, sum_tag) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Sum(acc); - if (threadIdx.x == 0) { - raft::myAtomicAdd(out, tmp); - } + if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); } } template -__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) { +__device__ void reduce(OutType* out, const InType acc, ReduceLambda op) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); - if (threadIdx.x == 0) { - raft::myAtomicReduce(out, tmp, op); - } + if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); } } -template -__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral, - MapOp map, ReduceLambda op, - const InType *in, Args... args) { +template +__global__ void mapThenReduceKernel(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) +{ OutType acc = neutral; - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - acc = map(in[idx], args[idx]...); - } + if (idx < len) { acc = map(in[idx], args[idx]...); } __syncthreads(); reduce(out, acc, op); } -template -void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduceImpl(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ raft::update_device(out, &neutral, 1, stream); const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel @@ -89,10 +105,14 @@ void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, * @param args additional input arrays */ -template -void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ mapThenReduceImpl( out, len, (OutType)0, map, sum_tag(), stream, in, args...); } @@ -115,11 +135,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ mapThenReduceImpl( out, len, neutral, map, op, stream, in, args...); } diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 93f2d746fa..81c1919b2e 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -27,19 +27,24 @@ namespace { template struct AlignedAccess { template - static inline bool test(const T *matrix, size_t strideBytes) { - return Pow2::isAligned(matrix) && - Pow2::isAligned(strideBytes) && + static inline bool test(const T* matrix, size_t strideBytes) + { + return Pow2::isAligned(matrix) && Pow2::isAligned(strideBytes) && Pow2::isAligned(VecBytes); } }; }; // namespace template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = threadIdx.x; @@ -70,17 +75,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType len = N * D; - IdxType nblks = - raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType len = N * D; + IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -108,10 +117,17 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, - cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; size_t stride_bytes = stride * sizeof(Type); if (AlignedAccess<16>::test(matrix, stride_bytes)) { @@ -138,10 +154,16 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, ///@todo: come up with a cleaner interface to support these cases in future! template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector1, const Type *vector2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector1, + const Type* vector2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; @@ -174,15 +196,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -211,10 +239,18 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; size_t stride_bytes = stride * sizeof(Type); if (AlignedAccess<16>::test(matrix, stride_bytes)) { diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index 9d1538c172..a3fcc5bac6 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -24,7 +24,7 @@ namespace linalg { /** * @brief CUDA version mean squared error function mean((A-B)**2) * @tparam math_t data-type upon which the math operation will be performed - * @tparam TPB threads-per-block + * @tparam TPB threads-per-block * @param out the output mean squared error value (assumed to be a device pointer) * @param A input array (assumed to be a device pointer) * @param B input array (assumed to be a device pointer) @@ -33,14 +33,14 @@ namespace linalg { * @param stream cuda-stream where to launch this kernel */ template -void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len, - math_t weight, cudaStream_t stream) { +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { math_t diff = a - b; return diff * diff * weight / len; }; - mapThenSumReduce(out, len, sq_diff, stream, A, - B); + mapThenSumReduce(out, len, sq_diff, stream, A, B); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index ce948c927d..53d57ecd00 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 64930a7123..82558c8023 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -44,22 +44,46 @@ enum NormType { L1Norm = 0, L2Norm }; * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void rowNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } @@ -77,22 +101,46 @@ void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void colNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index a50448acbe..c85cfda934 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -41,14 +41,18 @@ namespace linalg { * @{ */ template -void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, - int n_rows, int n_cols, cudaStream_t stream) { +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; int k = min(m, n); - CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); rmm::device_uvector tau(k, stream); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); @@ -58,19 +62,16 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); #endif - CUSOLVER_CHECK( - cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); } /** @@ -84,29 +85,40 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, * @param stream cuda stream */ template -void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, - int n_rows, int n_cols, cudaStream_t stream) { +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; rmm::device_uvector R_full(m * n, stream); rmm::device_uvector tau(min(m, n), stream); - CUDA_CHECK( - cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; - CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Lwork; rmm::device_scalar devInfo(stream); - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, - R_full_ncols, R_full.data(), - R_full_nrows, &Lwork)); + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, - tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, + R_full_nrows, + R_full_ncols, + R_full.data(), + R_full_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); @@ -114,17 +126,24 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); - CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Q_nrows = m, Q_ncols = n; - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, - min(Q_ncols, Q_nrows), Q, Q_nrows, - tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr( - cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), - workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, + Q_nrows, + Q_ncols, + min(Q_ncols, Q_nrows), + Q, + Q_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); } /** @} */ diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index d39577bbdd..693a797db9 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -52,28 +52,33 @@ namespace linalg { * @param reduce_op binary reduction operation * @param final_op elementwise operation to apply before storing results */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void reduce(OutType *dots, const InType *data, int D, int N, OutType init, - bool rowMajor, bool alongRows, cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void reduce(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + bool rowMajor, + bool alongRows, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index bba652e137..f931c976fd 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -28,14 +28,15 @@ namespace linalg { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, - int N, Type init, MainLambda main_op) { +__global__ void stridedSummationKernel( + Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) +{ // Thread reduction Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; - int stride = blockDim.y * gridDim.y; + int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; thread_data += main_op(data[idx], j); @@ -44,8 +45,8 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, // Block reduction extern __shared__ char tmp[]; // One element per thread in block - Type *temp = (Type *)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; + Type* temp = (Type*)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { @@ -54,24 +55,31 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, } // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) - raft::myAtomicAdd(dots + colStart, temp[myidx]); + if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); } // Kernel to perform reductions along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout -template -__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, - int N, OutType init, MainLambda main_op, - ReduceLambda reduce_op) { +template +__global__ void stridedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) +{ // Thread reduction OutType thread_data = init; - IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; - IdxType stride = blockDim.y * gridDim.y; + IdxType stride = blockDim.y * gridDim.y; for (IdxType j = rowStart; j < N; j += stride) { IdxType idx = colStart + j * D; thread_data = reduce_op(thread_data, main_op(data[idx], j)); @@ -79,14 +87,13 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, } // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - auto *temp = (OutType *)tmp; // Cast to desired type + extern __shared__ char tmp[]; // One element per thread in block + auto* temp = (OutType*)tmp; // Cast to desired type IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); - temp[myidx] = thread_data; + temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) - temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); __syncthreads(); } @@ -122,15 +129,23 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void stridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ ///@todo: this extra should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) @@ -140,7 +155,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, // Arbitrary numbers for now, probably need to tune const dim3 thrds(32, 16); IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; @@ -153,8 +168,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, <<>>(dots, data, D, N, init, main_op); else stridedReductionKernel - <<>>(dots, data, D, N, init, main_op, - reduce_op); + <<>>(dots, data, D, N, init, main_op, reduce_op); ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 882c105689..43060d0818 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -38,8 +38,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; unaryOp(out, in, len, op, stream); } @@ -58,24 +58,25 @@ void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { - //TODO: kernel do not use shared memory in current implementation +__global__ void subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] - *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -86,9 +87,12 @@ __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @remark block size has not been tuned */ template -void subtractDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // Just for the note - there is no way to express such operation with cuBLAS in effective way // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 2315920689..e14a5b6a50 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -51,12 +51,20 @@ namespace linalg { // TODO: couldn't template this function due to cusolverDnSgesvd and // cusolverSnSgesvd. Check if there is any other way. template -void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, - T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, - bool trans_right, bool gen_left_vec, bool gen_right_vec, - cudaStream_t stream) { +void svdQR(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* sing_vals, + T* left_sing_vecs, + T* right_sing_vecs, + bool trans_right, + bool gen_left_vec, + bool gen_right_vec, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 // 46340: sqrt of max int value @@ -71,14 +79,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int n = n_cols; rmm::device_scalar devInfo(stream); - T *d_rwork = nullptr; + T* d_rwork = nullptr; int lwork = 0; - CUSOLVER_CHECK( - cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + CUSOLVER_CHECK(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); rmm::device_uvector d_work(lwork, stream); - char jobu = 'S'; + char jobu = 'S'; char jobvt = 'A'; if (!gen_left_vec) { @@ -91,9 +98,23 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, strcpy(&jobvt, &new_vt); } - CUSOLVER_CHECK(cusolverDngesvd( - cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, - right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngesvd(cusolverH, + jobu, + jobvt, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + d_rwork, + devInfo.data(), + stream)); // Transpose the right singular vector back if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); @@ -109,18 +130,36 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, } template -void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, - T *U, T *V, bool gen_left_vec, cudaStream_t stream) { +void svdEig(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* S, + T* U, + T* V, + bool gen_left_vec, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; rmm::device_uvector in_cross_mult(len, stream); T alpha = T(1); - T beta = T(0); - raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(), - n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, + T beta = T(0); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + in, + in_cross_mult.data(), + n_cols, + n_cols, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, stream); eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); @@ -131,10 +170,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); if (gen_left_vec) { - raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); - raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, - true, stream); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + V, + U, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream); } } @@ -156,10 +205,19 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, * @param stream cuda stream */ template -void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - math_t *sing_vals, math_t *left_sing_vecs, - math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, - math_t tol, int max_sweeps, cudaStream_t stream) { +void svdJacobi(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + math_t* sing_vals, + math_t* left_sing_vecs, + math_t* right_sing_vecs, + bool gen_left_vec, + bool gen_right_vec, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -174,18 +232,42 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, rmm::device_scalar devInfo(stream); int lwork = 0; - int econ = 1; - - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); rmm::device_uvector d_work(lwork, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), - gesvdj_params, stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + devInfo.data(), + gesvdj_params, + stream)); CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -204,16 +286,34 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @param stream cuda stream */ template -void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, - math_t *V, math_t *out, int n_rows, int n_cols, int k, - cudaStream_t stream) { +void svdReconstruction(const raft::handle_t& handle, + math_t* U, + math_t* S, + math_t* V, + math_t* out, + int n_rows, + int n_cols, + int k, + cudaStream_t stream) +{ const math_t alpha = 1.0, beta = 0.0; rmm::device_uvector SVT(k * n_cols, stream); - raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, - CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::linalg::gemm( + handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, + U, + n_rows, + k, + SVT.data(), + out, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); } /** @@ -231,9 +331,17 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, * @param stream cuda stream */ template -bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, - math_t *S_vec, math_t *V, int n_rows, int n_cols, - int k, math_t tol, cudaStream_t stream) { +bool evaluateSVDByL2Norm(const raft::handle_t& handle, + math_t* A_d, + math_t* U, + math_t* S_vec, + math_t* V, + int n_rows, + int n_cols, + int k, + math_t tol, + cudaStream_t stream) +{ cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; @@ -257,16 +365,25 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, // calculate percent error const math_t alpha = 1.0, beta = -1.0; rmm::device_uvector A_minus_P(m * n, stream); - CUDA_CHECK( - cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - - CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, - &alpha, A_d, m, &beta, P_d.data(), m, - A_minus_P.data(), m, stream)); - - math_t norm_A_minus_P = - raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); - math_t percent_error = 100.0 * norm_A_minus_P / normA; + CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); + + math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; return (percent_error / 100.0 < tol); } diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index db1cabd694..e84ddd1166 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -33,18 +33,34 @@ namespace linalg { * @param stream: cuda stream */ template -void transpose(const raft::handle_t &handle, math_t *in, math_t *out, - int n_rows, int n_cols, cudaStream_t stream) { +void transpose(const raft::handle_t& handle, + math_t* in, + math_t* out, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); int out_n_rows = n_cols; int out_n_cols = n_rows; const math_t alpha = 1.0; - const math_t beta = 0.0; - CUBLAS_CHECK(raft::linalg::cublasgeam( - cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in, - n_rows, &beta, out, out_n_rows, out, out_n_rows, stream)); + const math_t beta = 0.0; + CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); } /** @@ -54,24 +70,24 @@ void transpose(const raft::handle_t &handle, math_t *in, math_t *out, * @param stream: cuda stream */ template -void transpose(math_t *inout, int n, cudaStream_t stream) { - auto m = n; - auto size = n * n; - auto d_inout = inout; +void transpose(math_t* inout, int n, cudaStream_t stream) +{ + auto m = n; + auto size = n * n; + auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(int idx) { - int s_row = idx % m; - int s_col = idx / m; - int d_row = s_col; - int d_col = s_row; - if (s_row < s_col) { - auto temp = d_inout[d_col * m + d_row]; - d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; - d_inout[s_col * m + s_row] = temp; - } - }); + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 46b4d296cb..198b9b2b10 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -23,10 +23,9 @@ namespace raft { namespace linalg { -template -__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, - Lambda op) { +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a; @@ -42,12 +41,10 @@ __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, b.store(out, idx); } -template -void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); unaryOpKernel <<>>(out, in, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -68,47 +65,38 @@ void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val);` */ -template -void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - if (len <= 0) return; //silently skip in case of 0 length input - constexpr auto maxSize = - sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && - outAddr % 16 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && - outAddr % 8 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && - outAddr % 4 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && - outAddr % 2 == 0) { - unaryOpImpl( - out, in, len, op, stream); +template +void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); } else if (1 / maxSize) { - unaryOpImpl( - out, in, len, op, stream); + unaryOpImpl(out, in, len, op, stream); } else { - unaryOpImpl(out, in, len, op, - stream); + unaryOpImpl(out, in, len, op, stream); } } template -__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { - op(out + idx, idx); - } + if (idx < len) { op(out + idx, idx); } } /** @@ -128,14 +116,12 @@ __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { * where outLocationOffset will be out + idx. * @param[in] stream cuda stream where to launch work */ -template -void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, - cudaStream_t stream) { +template +void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel - <<>>(out, len, op); + writeOnlyUnaryOpKernel<<>>(out, len, op); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index f79cb397b7..4b56f3986f 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -25,30 +25,29 @@ namespace detail { // Computes the argmax(d_in) column-wise in a DxN matrix template -__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) { +__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(-1, -raft::myInf()); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx])); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); - if (threadIdx.x == 0) { - argmax[blockIdx.x] = maxKV.key; - } + if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; } } template -void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, - cudaStream_t stream) { +void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +{ int D = n_rows; int N = n_cols; if (D <= 32) { @@ -67,39 +66,39 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T *d_in, int D, int N) { +__global__ void signFlipKernel(T* d_in, int D, int N) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax (with abs()) index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(0, 0); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx]))); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); // flip column sign if d_in[maxIndex] < 0 __shared__ bool need_sign_flip; - if (threadIdx.x == 0) { - need_sign_flip = d_in[maxKV.key] < T(0); - } + if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); } __syncthreads(); if (need_sign_flip) { for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; d_in[idx] = -d_in[idx]; } } } template -void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { - int D = n_rows; - int N = n_cols; +void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) +{ + int D = n_rows; + int N = n_cols; auto data = inout; if (D <= 32) { signFlipKernel<<>>(data, D, N); diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 8293d01bdb..709570ae56 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -28,29 +28,32 @@ namespace matrix { namespace detail { template -void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, - const idx_array_t *indices, idx_t n_rows_indices, - cudaStream_t stream, bool rowMajor = false) { +void copyRows(const m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + const idx_array_t* indices, + idx_t n_rows_indices, + cudaStream_t stream, + bool rowMajor = false) +{ if (rowMajor) { const idx_t TPB = 256; - cache:: - get_vecs<<>>( - in, n_cols, indices, n_rows_indices, out); + cache::get_vecs<<>>( + in, n_cols, indices, n_rows_indices, out); CUDA_CHECK(cudaPeekAtLastError()); return; } - idx_t size = n_rows_indices * n_cols; + idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % n_rows_indices; - idx_t col = idx / n_rows_indices; + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % n_rows_indices; + idx_t col = idx / n_rows_indices; - out[col * n_rows_indices + row] = - in[col * n_rows + indices[row]]; - }); + out[col * n_rows_indices + row] = in[col * n_rows + indices[row]]; + }); } /** @@ -65,8 +68,9 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, * (1-based) */ template -__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, - idx_t y1, idx_t x2, idx_t y2) { +__global__ void slice( + m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; if (idx < dm * dn) { @@ -77,8 +81,16 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, } template -void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, - idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { +void sliceMatrix(m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2, + cudaStream_t stream) +{ // Slicing dim3 block(64); dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); @@ -94,21 +106,19 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, - idx_t n_cols, idx_t k) { +__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; if (idx < m * n) { idx_t i = idx % m, j = idx / m; - if (i < k && j < k && j >= i) { - dst[i + j * k] = src[idx]; - } + if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; } } } template -void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t m = n_rows, n = n_cols; idx_t k = min(m, n); dim3 block(64); @@ -125,23 +135,21 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, - idx_t n, idx_t k) { +__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < k) { - matrix[idx + idx * m] = vec[idx]; - } + if (idx < k) { matrix[idx + idx * m] = vec[idx]; } } template -void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void initializeDiagonalMatrix( + m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t k = min(n_rows, n_cols); dim3 block(64); dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, - n_cols, k); + copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); } /** @@ -151,15 +159,15 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t *in, idx_t len) { +__global__ void matrixDiagonalInverse(m_t* in, idx_t len) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < len) { - in[idx + idx * len] = 1.0 / in[idx + idx * len]; - } + if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } } template -void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { +void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) +{ dim3 block(64); dim3 grid((len + block.x - 1) / block.x); matrixDiagonalInverse<<>>(in, len); diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp index e67440019f..df6eb6f489 100644 --- a/cpp/include/raft/matrix/math.hpp +++ b/cpp/include/raft/matrix/math.hpp @@ -43,14 +43,18 @@ namespace matrix { * @param stream cuda stream */ template -void power(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream) { - auto d_src = in; +void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) +{ + auto d_src = in; auto d_dest = out; raft::linalg::binaryOp( - d_dest, d_src, d_src, len, - [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream); + d_dest, + d_src, + d_src, + len, + [=] __device__(math_t a, math_t b) { return scalar * a * b; }, + stream); } /** @@ -61,7 +65,8 @@ void power(math_t *in, math_t *out, math_t scalar, int len, * @param stream cuda stream */ template -void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { +void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) +{ power(inout, inout, scalar, len, stream); } @@ -72,7 +77,8 @@ void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { * @param stream cuda stream */ template -void power(math_t *inout, int len, cudaStream_t stream) { +void power(math_t* inout, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(inout, scalar, len, stream); } @@ -86,7 +92,8 @@ void power(math_t *inout, int len, cudaStream_t stream) { * @{ */ template -void power(math_t *in, math_t *out, int len, cudaStream_t stream) { +void power(math_t* in, math_t* out, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(in, out, scalar, len, stream); } @@ -103,13 +110,20 @@ void power(math_t *in, math_t *out, int len, cudaStream_t stream) { * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, - cudaStream_t stream, bool set_neg_zero = false) { - auto d_src = in; +void seqRoot(math_t* in, + math_t* out, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool set_neg_zero = false) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (set_neg_zero) { if (a < math_t(0)) { @@ -135,8 +149,9 @@ void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool set_neg_zero = false) { +void seqRoot( + math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) +{ seqRoot(inout, inout, scalar, len, stream, set_neg_zero); } @@ -150,22 +165,27 @@ void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(in, out, scalar, len, stream); } template -void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(inout, inout, scalar, len, stream); } template -void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, - cudaStream_t stream, math_t thres = 1e-15) { +void setSmallValuesZero( + math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ raft::linalg::unaryOp( - out, in, len, + out, + in, + len, [=] __device__(math_t a) { if (a <= thres && -a <= thres) { return math_t(0); @@ -186,8 +206,8 @@ void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, * @param thres: threshold */ template -void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, - math_t thres = 1e-15) { +void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ setSmallValuesZero(inout, inout, len, stream, thres); } @@ -205,14 +225,21 @@ void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, * @{ */ template -void reciprocal(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream, bool setzero = false, - math_t thres = 1e-15) { - auto d_src = in; +void reciprocal(math_t* in, + math_t* out, + math_t scalar, + int len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (setzero) { if (abs(a) <= thres) { @@ -239,8 +266,13 @@ void reciprocal(math_t *in, math_t *out, math_t scalar, int len, * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0) */ template -void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool setzero = false, math_t thres = 1e-15) { +void reciprocal(math_t* inout, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ reciprocal(inout, inout, scalar, len, stream, setzero, thres); } @@ -253,7 +285,8 @@ void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(inout, scalar, len, stream); } @@ -268,14 +301,15 @@ void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { * @param stream cuda stream */ template -void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(in, out, scalar, len, stream); } template -void setValue(math_t *out, const math_t *in, math_t scalar, int len, - cudaStream_t stream = 0) { +void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) +{ raft::linalg::unaryOp( out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); } @@ -290,18 +324,18 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param stream cuda stream */ template -void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, - cudaStream_t stream) { - auto d_src = src; +void ratio( + const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) +{ + auto d_src = src; auto d_dest = dest; rmm::device_scalar d_sum(stream); - auto *d_sum_ptr = d_sum.data(); - auto no_op = [] __device__(math_t in) { return in; }; + auto* d_sum_ptr = d_sum.data(); + auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); raft::linalg::unaryOp( - d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, - stream); + d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); } /** @} */ @@ -315,8 +349,8 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, * @param stream: cuda stream */ template -void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, - cudaStream_t stream) { +void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +{ detail::argmax(in, n_rows, n_cols, out, stream); } @@ -329,25 +363,49 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, * @param stream cuda stream */ template -void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { +void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) +{ detail::signFlip(inout, n_rows, n_cols, stream); } template -void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryMult(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a * b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, + stream); } template -void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream) { +void matrixVectorBinaryMultSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (b == Type(0)) return a; @@ -358,22 +416,45 @@ void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryDiv(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a / b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, + stream); } template -void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream, - bool return_zero = false) { +void matrixVectorBinaryDivSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ if (return_zero) { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return Type(0); @@ -383,7 +464,13 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, stream); } else { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return a; @@ -395,21 +482,45 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryAdd(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } template -void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinarySub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } }; // end namespace matrix diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index 8dd9fbf487..c4cd30b7bc 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -47,11 +47,16 @@ using namespace std; * @param rowMajor whether the matrix has row major layout */ template -void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, - const idx_array_t *indices, idx_t n_rows_indices, - cudaStream_t stream, bool rowMajor = false) { - detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, - rowMajor); +void copyRows(const m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + const idx_array_t* indices, + idx_t n_rows_indices, + cudaStream_t stream, + bool rowMajor = false) +{ + detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, rowMajor); } /** @@ -63,8 +68,8 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, * @param stream: cuda stream */ template -void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ raft::copy_async(out, in, n_rows * n_cols, stream); } @@ -79,21 +84,21 @@ void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, - idx_t out_n_cols, cudaStream_t stream) { - auto m = out_n_rows; - auto k = in_n_rows; - idx_t size = out_n_rows * out_n_cols; - auto d_q = in; +void truncZeroOrigin( + m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) +{ + auto m = out_n_rows; + auto k = in_n_rows; + idx_t size = out_n_rows * out_n_cols; + auto d_q = in; auto d_q_trunc = out; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % m; - idx_t col = idx / m; - d_q_trunc[col * m + row] = d_q[col * k + row]; - }); + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % m; + idx_t col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); } /** @@ -105,24 +110,25 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, * @param stream: cuda stream */ template -void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto n = n_cols; - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto n = n_cols; + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), - [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = dest_row; - idx_t src_col = (n - dest_col) - 1; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = dest_row; + idx_t src_col = (n - dest_col) - 1; + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -134,25 +140,26 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param stream: cuda stream */ template -void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), - [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = (m - dest_row) - 1; + ; + idx_t src_col = dest_col; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -164,16 +171,19 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param v_separator: vertical separator character */ template -void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', - char v_separator = '\n', - cudaStream_t stream = rmm::cuda_stream_default) { +void print(const m_t* in, + idx_t n_rows, + idx_t n_cols, + char h_separator = ' ', + char v_separator = '\n', + cudaStream_t stream = rmm::cuda_stream_default) +{ std::vector h_matrix = std::vector(n_cols * n_rows); raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f%c", h_matrix[j * n_rows + i], - j < n_cols - 1 ? h_separator : v_separator); + printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator); } } } @@ -185,7 +195,8 @@ void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', * @param n_cols: number of columns of input matrix */ template -void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { +void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) +{ for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { printf("%1.4f ", in[j * n_rows + i]); @@ -208,8 +219,16 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { * @param stream: cuda stream */ template -void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, - idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { +void sliceMatrix(m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2, + cudaStream_t stream) +{ detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); } @@ -222,8 +241,8 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, * @param stream: cuda stream */ template -void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream); } @@ -236,8 +255,9 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void initializeDiagonalMatrix( + m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); } @@ -248,7 +268,8 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { +void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) +{ detail::getDiagonalInverseMatrix(in, len, stream); } @@ -260,12 +281,11 @@ void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { * @param stream: cuda stream */ template -m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size, - cudaStream_t stream) { +m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +{ cublasHandle_t cublasH = handle.get_cublas_handle(); - m_t normval = 0; - CUBLAS_CHECK( - raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + m_t normval = 0; + CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 4a2362bf97..38ef59aadf 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -38,11 +38,11 @@ namespace mr { template class buffer_base { public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; + using size_type = std::size_t; + using value_type = T; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = T&; using const_reference = const T&; buffer_base() = delete; @@ -58,16 +58,12 @@ class buffer_base { * @param[in] stream cuda stream where this allocation operations are async * @param[in] n size of the buffer (in number of elements) */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, - size_type n = 0) - : data_(nullptr), - size_(n), - capacity_(n), - stream_(stream), - allocator_(std::move(allocator)) { + buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) + : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) + { if (capacity_ > 0) { - data_ = static_cast( - allocator_->allocate(capacity_ * sizeof(value_type), stream_)); + data_ = + static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); CUDA_CHECK(cudaStreamSynchronize(stream_)); } } @@ -100,23 +96,23 @@ class buffer_base { * @param[in] new_capacity new capacity (in number of elements) * @{ */ - void reserve(size_type new_capacity) { + void reserve(size_type new_capacity) + { if (new_capacity > capacity_) { - auto* new_data = static_cast( - allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { - raft::copy(new_data, data_, size_, stream_); - } + auto* new_data = + static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); + if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } // Only deallocate if we have allocated a pointer if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = new_data; + data_ = new_data; capacity_ = new_capacity; } } - void reserve(size_type new_capacity, cudaStream_t stream) { + void reserve(size_type new_capacity, cudaStream_t stream) + { set_stream(stream); reserve(new_capacity); } @@ -128,12 +124,14 @@ class buffer_base { * @param[in] new_size new buffer size * @{ */ - void resize(const size_type new_size) { + void resize(const size_type new_size) + { reserve(new_size); size_ = new_size; } - void resize(const size_type new_size, cudaStream_t stream) { + void resize(const size_type new_size, cudaStream_t stream) + { set_stream(stream); resize(new_size); } @@ -145,16 +143,18 @@ class buffer_base { * If this method is not explicitly called, it will be during the destructor * @{ */ - void release() { + void release() + { if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = nullptr; + data_ = nullptr; capacity_ = 0; - size_ = 0; + size_ = 0; } - void release(cudaStream_t stream) { + void release(cudaStream_t stream) + { set_stream(stream); release(); } @@ -194,7 +194,8 @@ class buffer_base { * @param[in] stream new cuda stream to be set. If it is the same as the * current one, then this method will be a no-op. */ - void set_stream(cudaStream_t stream) { + void set_stream(cudaStream_t stream) + { if (stream_ != stream) { cudaEvent_t event; CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp index 3d1ce38c31..8d306a199f 100644 --- a/cpp/include/raft/mr/device/allocator.hpp +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -34,17 +34,20 @@ namespace device { * further to the ones listed in `Allocator`: * - Allocations may be always on the device that was specified on construction. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default device allocator based on the one provided by RMM */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { rmm::mr::get_current_device_resource()->deallocate(p, n, stream); } }; // class default_allocator diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp index 39b5674ce4..2b9d84368f 100644 --- a/cpp/include/raft/mr/device/buffer.hpp +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -46,11 +46,11 @@ namespace device { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -60,7 +60,9 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } }; // class buffer }; // namespace device diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index e5b3da24eb..7d31248e7f 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -35,20 +35,23 @@ namespace host { * further to the ones listed in `Allocator`: * - Allocations don't need to be zero copy accessible form a device. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default cudaMallocHost/cudaFreeHost based host allocator */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = nullptr; CUDA_CHECK(cudaMallocHost(&ptr, n)); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { - //Must call _NO_THROW here since this is called frequently from object - //destructors which are "nothrow" by default + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { + // Must call _NO_THROW here since this is called frequently from object + // destructors which are "nothrow" by default CUDA_CHECK_NO_THROW(cudaFreeHost(p)); } }; // class default_allocator diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp index 3c505bf2ed..52475ad6ec 100644 --- a/cpp/include/raft/mr/host/buffer.hpp +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -48,11 +48,11 @@ namespace host { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -62,14 +62,15 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) { - if (other.size() > 0) { - raft::copy(data_, other.data(), other.size(), other.get_stream()); - } + : buffer_base(alloc, other.get_stream(), other.size()) + { + if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } } buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } reference operator[](size_type pos) { return data_[pos]; } diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh index de5fc46452..56a3192f9f 100644 --- a/cpp/include/raft/pow2_utils.cuh +++ b/cpp/include/raft/pow2_utils.cuh @@ -29,14 +29,13 @@ template struct Pow2 { typedef decltype(Value_) Type; static constexpr Type Value = Value_; - static constexpr Type Log2 = log2(Value); - static constexpr Type Mask = Value - 1; + static constexpr Type Log2 = log2(Value); + static constexpr Type Mask = Value - 1; static_assert(std::is_integral::value, "Value must be integral."); static_assert(Value && !(Value & Mask), "Value must be power of two."); -#define Pow2_IsRepresentableAs(I) \ - (std::is_integral::value && Type(I(Value)) == Value) +#define Pow2_IsRepresentableAs(I) (std::is_integral::value && Type(I(Value)) == Value) /** * Integer division by Value truncated toward zero @@ -45,10 +44,9 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)` */ template - static constexpr HDI std::enable_if_t quot( - I x) noexcept { - if constexpr (std::is_signed::value) - return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); + static constexpr HDI std::enable_if_t quot(I x) noexcept + { + if constexpr (std::is_signed::value) return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); if constexpr (std::is_unsigned::value) return x >> I(Log2); } @@ -59,10 +57,9 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)`. */ template - static constexpr HDI std::enable_if_t rem( - I x) noexcept { - if constexpr (std::is_signed::value) - return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); + static constexpr HDI std::enable_if_t rem(I x) noexcept + { + if constexpr (std::is_signed::value) return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); if constexpr (std::is_unsigned::value) return x & I(Mask); } @@ -77,8 +74,8 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t div( - I x) noexcept { + static constexpr HDI std::enable_if_t div(I x) noexcept + { return x >> I(Log2); } @@ -94,8 +91,8 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t mod( - I x) noexcept { + static constexpr HDI std::enable_if_t mod(I x) noexcept + { return x & I(Mask); } @@ -108,16 +105,17 @@ struct Pow2 { * NB: for pointers, the alignment is checked in bytes, not in elements. */ template - static constexpr HDI bool isAligned(PtrT p) noexcept { + static constexpr HDI bool isAligned(PtrT p) noexcept + { Pow2_CHECK_TYPE(PtrT); if constexpr (Pow2_IsRepresentableAs(PtrT)) return mod(p) == 0; - if constexpr (!Pow2_IsRepresentableAs(PtrT)) - return mod(reinterpret_cast(p)) == 0; + if constexpr (!Pow2_IsRepresentableAs(PtrT)) return mod(reinterpret_cast(p)) == 0; } /** Tell whether two pointers have the same address modulo Value. */ template - static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { + static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept + { Pow2_CHECK_TYPE(PtrT); Pow2_CHECK_TYPE(PtrS); Type x, y; @@ -134,10 +132,10 @@ struct Pow2 { /** Get this or next Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundUp(PtrT p) noexcept { + static constexpr HDI PtrT roundUp(PtrT p) noexcept + { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) - return p + PtrT(Mask) - mod(p + PtrT(Mask)); + if constexpr (Pow2_IsRepresentableAs(PtrT)) return p + PtrT(Mask) - mod(p + PtrT(Mask)); if constexpr (!Pow2_IsRepresentableAs(PtrT)) { auto x = reinterpret_cast(p); return reinterpret_cast(x + Mask - mod(x + Mask)); @@ -146,7 +144,8 @@ struct Pow2 { /** Get this or previous Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundDown(PtrT p) noexcept { + static constexpr HDI PtrT roundDown(PtrT p) noexcept + { Pow2_CHECK_TYPE(PtrT); if constexpr (Pow2_IsRepresentableAs(PtrT)) return p - mod(p); if constexpr (!Pow2_IsRepresentableAs(PtrT)) { diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh index 654c46bbf9..0f3b58975e 100644 --- a/cpp/include/raft/random/detail/rng_impl.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -44,19 +44,20 @@ enum GeneratorType { }; template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { - constexpr Type twoPi = Type(2.0) * Type(3.141592654); +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +{ + constexpr Type twoPi = Type(2.0) * Type(3.141592654); constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); - Type theta = twoPi * val2; + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; Type s, c; raft::mySinCos(theta, s, c); val1 = R * c * sigma1 + mu1; val2 = R * s * sigma2 + mu2; } template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +{ box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); } @@ -67,10 +68,13 @@ DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { template struct Generator { DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) - : gen(seed, subsequence, offset) {} + : gen(seed, subsequence, offset) + { + } template - DI void next(Type &ret) { + DI void next(Type& ret) + { gen.next(ret); } @@ -79,10 +83,9 @@ struct Generator { GenType gen; }; -template -__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda randOp) { +template +__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -94,10 +97,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, } // used for Box-Muller type transformations -template -__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda2 rand2Op) { +template +__global__ void rand2Kernel( + uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -113,8 +116,9 @@ __global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, } template -__global__ void constFillKernel(Type *ptr, int len, Type val) { - unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; +__global__ void constFillKernel(Type* ptr, int len, Type val) +{ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; const unsigned stride = gridDim.x * blockDim.x; for (unsigned idx = tid; idx < len; idx += stride) { ptr[idx] = val; @@ -130,7 +134,8 @@ struct PhiloxGenerator { * @param subsequence as found in curand docs * @param offset as found in curand docs */ - DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { curand_init(seed, subsequence, offset, &state); } @@ -138,21 +143,24 @@ struct PhiloxGenerator { * @defgroup NextRand Generate the next random number * @{ */ - DI void next(float &ret) { ret = curand_uniform(&(this->state)); } - DI void next(double &ret) { ret = curand_uniform_double(&(this->state)); } - DI void next(uint32_t &ret) { ret = curand(&(this->state)); } - DI void next(uint64_t &ret) { + DI void next(float& ret) { ret = curand_uniform(&(this->state)); } + DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } + DI void next(uint32_t& ret) { ret = curand(&(this->state)); } + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t &ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t &ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -173,8 +181,9 @@ struct TapsGenerator { * @param subsequence unused * @param offset unused */ - DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; uint64_t stride = blockDim.x * gridDim.x; delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; stride *= blockDim.y * gridDim.y; @@ -187,31 +196,36 @@ struct TapsGenerator { * @{ */ template - DI void next(Type &ret) { + DI void next(Type& ret) + { constexpr double ULL_LARGE = 1.8446744073709551614e19; uint64_t val; next(val); ret = static_cast(val); ret /= static_cast(ULL_LARGE); } - DI void next(uint64_t &ret) { + DI void next(uint64_t& ret) + { constexpr uint64_t TAPS = 0x8000100040002000ULL; - constexpr int ROUNDS = 128; + constexpr int ROUNDS = 128; for (int i = 0; i < ROUNDS; i++) state = (state >> 1) ^ (-(state & 1ULL) & TAPS); ret = state; } - DI void next(uint32_t &ret) { + DI void next(uint32_t& ret) + { uint64_t val; next(val); ret = (uint32_t)val; } - DI void next(int32_t &ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t &ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -232,46 +246,49 @@ struct Kiss99Generator { * @param subsequence unused * @param offset unused */ - DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - initKiss99(seed); - } + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); } /** * @defgroup NextRand Generate the next random number * @{ */ template - DI void next(Type &ret) { + DI void next(Type& ret) + { constexpr double U_LARGE = 4.294967295e9; uint32_t val; next(val); ret = static_cast(val); ret /= static_cast(U_LARGE); } - DI void next(uint32_t &ret) { + DI void next(uint32_t& ret) + { uint32_t MWC; - z = 36969 * (z & 65535) + (z >> 16); - w = 18000 * (w & 65535) + (w >> 16); + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); MWC = ((z << 16) + w); jsr ^= (jsr << 17); jsr ^= (jsr >> 13); jsr ^= (jsr << 5); jcong = 69069 * jcong + 1234567; - MWC = ((MWC ^ jcong) + jsr); - ret = MWC; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; } - DI void next(uint64_t &ret) { + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t &ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t &ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -290,7 +307,8 @@ struct Kiss99Generator { // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower // 128 bits. It uses 32-bit wide multiply only. - DI void mulByFnv1a128Prime(uint32_t *h) { + DI void mulByFnv1a128Prime(uint32_t* h) + { typedef union { uint32_t u32[2]; uint64_t u64[1]; @@ -314,12 +332,12 @@ struct Kiss99Generator { // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); uint32_t carry = 0; - h[0] = h0p0.u32[0]; + h[0] = h0p0.u32[0]; - h[1] = h0p0.u32[1] + h1p0.u32[0]; + h[1] = h0p0.u32[1] + h1p0.u32[0]; carry = h[1] < h0p0.u32[1] ? 1 : 0; - h[2] = h1p0.u32[1] + carry; + h[2] = h1p0.u32[1] + carry; carry = h[2] < h1p0.u32[1] ? 1 : 0; h[2] += h2p0.u32[0]; carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; @@ -330,7 +348,8 @@ struct Kiss99Generator { return; } - DI void fnv1a128(uint32_t *hash, uint32_t txt) { + DI void fnv1a128(uint32_t* hash, uint32_t txt) + { hash[0] ^= (txt >> 0) & 0xFF; mulByFnv1a128Prime(hash); hash[0] ^= (txt >> 8) & 0xFF; @@ -341,7 +360,8 @@ struct Kiss99Generator { mulByFnv1a128Prime(hash); } - DI void initKiss99(uint64_t seed) { + DI void initKiss99(uint64_t seed) + { // Initialize hash to 128-bit FNV1a basis uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; @@ -356,9 +376,9 @@ struct Kiss99Generator { fnv1a128(hash, uint32_t(seed >> 32)); // Initialize KISS99 state with hash - z = hash[0]; - w = hash[1]; - jsr = hash[2]; + z = hash[0]; + w = hash[1]; + jsr = hash[2]; jcong = hash[3]; } }; @@ -372,17 +392,20 @@ class RngImpl { // simple heuristic to make sure all SMs will be occupied properly // and also not too many initialization calls will be made by each thread nBlocks(4 * getMultiProcessorCount()), - gen() { + gen() + { seed(_s); } - void seed(uint64_t _s) { + void seed(uint64_t _s) + { gen.seed(_s); offset = 0; } template - void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + void affine_transform_params(IdxT n, IdxT& a, IdxT& b) + { // always keep 'a' to be coprime to 'n' a = gen() % n; while (gcd(a, n) != 1) { @@ -394,128 +417,150 @@ class RngImpl { } template - void uniform(Type *ptr, LenType len, Type start, Type end, - cudaStream_t stream) { + void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'uniform' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return (val * (end - start)) + start; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; }, stream); } template - void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'uniformInt' can only be integer!"); + void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'uniformInt' can only be integer!"); custom_distribution( - ptr, len, - [=] __device__(IntType val, LenType idx) { - return (val % (end - start)) + start; - }, + ptr, + len, + [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; }, stream); } template - void normal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'normal' can only be floating point!"); rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'normalInt' can only be integer!"); + void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'normalInt' can only be integer!"); rand2Impl( - offset, ptr, len, - [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) { + offset, + ptr, + len, + [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, - const Type *sigma_vec, Type sigma, cudaStream_t stream) { + void normalTable(Type* ptr, + LenType n_rows, + LenType n_cols, + const Type* mu, + const Type* sigma_vec, + Type sigma, + cudaStream_t stream) + { rand2Impl( - offset, ptr, n_rows * n_cols, + offset, + ptr, + n_rows * n_cols, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { // yikes! use fast-int-div - auto col1 = idx1 % n_cols; - auto col2 = idx2 % n_cols; + auto col1 = idx1 % n_cols; + auto col2 = idx2 % n_cols; auto mean1 = mu[col1]; auto mean2 = mu[col2]; - auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; - auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; + auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; + auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; box_muller_transform(val1, val2, sig1, mean1, sig2, mean2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { - detail::constFillKernel - <<>>(ptr, len, val); + void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) + { + detail::constFillKernel<<>>(ptr, len, val); CUDA_CHECK(cudaPeekAtLastError()); } template - void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) + { custom_distribution( - ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, - stream); + ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream); } template - void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, - cudaStream_t stream) { + void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'scaled_bernoulli' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return val > prob ? -scale : scale; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; }, stream); } template - void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream) + { custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return mu - beta * raft::myLog(-raft::myLog(val)); - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); }, stream); } template - void lognormal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); val1 = raft::myExp(val1); val2 = raft::myExp(val2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void logistic(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return mu - scale * raft::myLog(one / val - one); @@ -524,9 +569,11 @@ class RngImpl { } template - void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return -raft::myLog(one - val) / lambda; @@ -535,9 +582,11 @@ class RngImpl { } template - void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; constexpr Type two = (Type)2.0; @@ -547,13 +596,14 @@ class RngImpl { } template - void laplace(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { - constexpr Type one = (Type)1.0; - constexpr Type two = (Type)2.0; + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; constexpr Type oneHalf = (Type)0.5; Type out; if (val <= oneHalf) { @@ -567,55 +617,55 @@ class RngImpl { } template - void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, - IdxT *outIdx, const DataT *in, - const WeightsT *wts, IdxT sampledLen, IdxT len, - cudaStream_t stream) { - ASSERT(sampledLen <= len, - "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); + void sampleWithoutReplacement(const raft::handle_t& handle, + DataT* out, + IdxT* outIdx, + const DataT* in, + const WeightsT* wts, + IdxT sampledLen, + IdxT len, + cudaStream_t stream) + { + ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); rmm::device_uvector expWts(len, stream); rmm::device_uvector sortedWts(len, stream); rmm::device_uvector inIdx(len, stream); rmm::device_uvector outIdxBuff(len, stream); - auto *inIdxPtr = inIdx.data(); + auto* inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( - expWts.data(), len, + expWts.data(), + len, [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) { - inIdxPtr[idx] = idx; + inIdxPtr[idx] = idx; constexpr WeightsT one = (WeightsT)1.0; - auto exp = -raft::myLog(one - val); - if (wts != nullptr) { - return exp / wts[idx]; - } + auto exp = -raft::myLog(one - val); + if (wts != nullptr) { return exp / wts[idx]; } return exp; }, stream); ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items - IdxT *outIdxPtr = outIdxBuff.data(); + IdxT* outIdxPtr = outIdxBuff.data(); rmm::device_uvector workspace(0, stream); - sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, - (int)len, stream); + sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { - CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync( + outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } raft::scatter(out, in, outIdxPtr, sampledLen, stream); } - template - void custom_distribution(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { randImpl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } - template - void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { rand2Impl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } @@ -625,10 +675,10 @@ class RngImpl { /** generator type */ GeneratorType type; /** - * offset is also used to initialize curand state. - * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), - * but is still a large period. - */ + * offset is also used to initialize curand state. + * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), + * but is still a large period. + */ uint64_t offset; /** number of blocks to launch */ int nBlocks; @@ -638,12 +688,10 @@ class RngImpl { static const int NumThreads = 256; template - uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len, - int nThreads, int nBlocks) { + uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks) + { LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads)); - if (IsNormal && itemsPerThread % 2 == 1) { - ++itemsPerThread; - } + if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; } // curand uses 2 32b uint's to generate one double uint64_t factor = sizeof(Type) / sizeof(float); if (factor == 0) ++factor; @@ -651,72 +699,72 @@ class RngImpl { // If not, then generate new seed and start from zero offset uint64_t newOffset = offset + LenType(itemsPerThread) * factor; if (newOffset < offset) { - offset = 0; - seed = gen(); + offset = 0; + seed = gen(); newOffset = itemsPerThread * factor; } return newOffset; } - template - void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void randImpl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda randOp, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - uint64_t seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + uint64_t seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: - detail::randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenTaps: - detail::randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenKiss99: - detail::randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; - default: - ASSERT(false, "randImpl: Incorrect generator type! %d", type); + default: ASSERT(false, "randImpl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; } - template - void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void rand2Impl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda2 rand2Op, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - auto seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + auto seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: - detail::rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenTaps: - detail::rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenKiss99: - detail::rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; - default: - ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); + default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp index b6b0911ab0..0cced7c626 100644 --- a/cpp/include/raft/random/rng.hpp +++ b/cpp/include/raft/random/rng.hpp @@ -51,12 +51,13 @@ using detail::Kiss99Generator; * @{ */ template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +{ detail::box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu2); } template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +{ detail::box_muller_transform(val1, val2, sigma1, mu1); } /** @} */ @@ -92,7 +93,8 @@ class Rng : public detail::RngImpl { * @param[out] b intercept parameter */ template - void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + void affine_transform_params(IdxT n, IdxT& a, IdxT& b) + { detail::RngImpl::affine_transform_params(n, a, b); } @@ -108,13 +110,13 @@ class Rng : public detail::RngImpl { * @{ */ template - void uniform(Type *ptr, LenType len, Type start, Type end, - cudaStream_t stream) { + void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream) + { detail::RngImpl::uniform(ptr, len, start, end, stream); } template - void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, - cudaStream_t stream) { + void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream) + { detail::RngImpl::uniformInt(ptr, len, start, end, stream); } /** @} */ @@ -131,13 +133,13 @@ class Rng : public detail::RngImpl { * @{ */ template - void normal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { detail::RngImpl::normal(ptr, len, mu, sigma, stream); } template - void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, - cudaStream_t stream) { + void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) + { detail::RngImpl::normalInt(ptr, len, mu, sigma, stream); } /** @} */ @@ -163,10 +165,15 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, - const Type *sigma_vec, Type sigma, cudaStream_t stream) { - detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, - stream); + void normalTable(Type* ptr, + LenType n_rows, + LenType n_cols, + const Type* mu, + const Type* sigma_vec, + Type sigma, + cudaStream_t stream) + { + detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, stream); } /** @@ -179,7 +186,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) + { detail::RngImpl::fill(ptr, len, val, stream); } @@ -196,7 +204,8 @@ class Rng : public detail::RngImpl { * @param[in] stream stream where to launch the kernel */ template - void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) + { detail::RngImpl::bernoulli(ptr, len, prob, stream); } @@ -211,8 +220,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, - cudaStream_t stream) { + void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream) + { detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream); } @@ -228,7 +237,8 @@ class Rng : public detail::RngImpl { * @note https://en.wikipedia.org/wiki/Gumbel_distribution */ template - void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream) + { detail::RngImpl::gumbel(ptr, len, mu, beta, stream); } @@ -243,8 +253,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void lognormal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { detail::RngImpl::lognormal(ptr, len, mu, sigma, stream); } @@ -259,8 +269,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void logistic(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { detail::RngImpl::logistic(ptr, len, mu, scale, stream); } @@ -274,7 +284,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream) + { detail::RngImpl::exponential(ptr, len, lambda, stream); } @@ -288,7 +299,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream) + { detail::RngImpl::rayleigh(ptr, len, sigma, stream); } @@ -303,8 +315,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void laplace(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { detail::RngImpl::laplace(ptr, len, mu, scale, stream); } @@ -334,12 +346,17 @@ class Rng : public detail::RngImpl { * @param stream cuda stream */ template - void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, - IdxT *outIdx, const DataT *in, - const WeightsT *wts, IdxT sampledLen, IdxT len, - cudaStream_t stream) { - detail::RngImpl::sampleWithoutReplacement(handle, out, outIdx, in, wts, - sampledLen, len, stream); + void sampleWithoutReplacement(const raft::handle_t& handle, + DataT* out, + IdxT* outIdx, + const DataT* in, + const WeightsT* wts, + IdxT sampledLen, + IdxT len, + cudaStream_t stream) + { + detail::RngImpl::sampleWithoutReplacement( + handle, out, outIdx, in, wts, sampledLen, len, stream); } /** @@ -357,16 +374,14 @@ class Rng : public detail::RngImpl { * @param[in] stream cuda stream * @{ */ - template - void custom_distribution(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { detail::RngImpl::custom_distribution(ptr, len, randOp, stream); } - template - void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { detail::RngImpl::custom_distribution2(ptr, len, randOp, stream); } /** @} */ diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh index e367550060..5d38bdf4a8 100644 --- a/cpp/include/raft/sparse/convert/coo.cuh +++ b/cpp/include/raft/sparse/convert/coo.cuh @@ -37,14 +37,18 @@ namespace sparse { namespace convert { template -__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, - value_idx *coo_rows, value_idx nnz) { +__global__ void csr_to_coo_kernel(const value_idx* row_ind, + value_idx m, + value_idx* coo_rows, + value_idx nnz) +{ // row-based matrix 1 thread per row value_idx row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { value_idx start_idx = row_ind[row]; - value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); - for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row; + value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); + for (value_idx i = start_idx; i < stop_idx; i++) + coo_rows[i] = row; } } @@ -57,14 +61,14 @@ __global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, * @param stream: cuda stream to use */ template -void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows, - value_idx nnz, cudaStream_t stream) { +void csr_to_coo( + const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream) +{ // @TODO: Use cusparse for this. dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_to_coo_kernel - <<>>(row_ind, m, coo_rows, nnz); + csr_to_coo_kernel<<>>(row_ind, m, coo_rows, nnz); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index 79b18ebd0a..2569b5d90f 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -43,28 +43,32 @@ namespace sparse { namespace convert { template -void coo_to_csr(const raft::handle_t &handle, const int *srcRows, - const int *srcCols, const value_t *srcVals, int nnz, int m, - int *dst_offsets, int *dstCols, value_t *dstVals) { - auto stream = handle.get_stream(); +void coo_to_csr(const raft::handle_t& handle, + const int* srcRows, + const int* srcCols, + const value_t* srcVals, + int nnz, + int m, + int* dst_offsets, + int* dstCols, + value_t* dstVals) +{ + auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); rmm::device_uvector dstRows(nnz, stream); - CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); rmm::device_uvector pBuffer(buffSize, stream); rmm::device_uvector P(nnz, stream); - CUSPARSE_CHECK( - cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); - raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), - dstCols, P.data(), pBuffer.data(), stream); - raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), - stream); - raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, - dst_offsets, stream); + CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + raft::sparse::cusparsecoosortByRow( + cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream); + raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream); + raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -83,14 +87,20 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, * @param stream cuda stream to use * @param fused_op: the fused operation */ -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ op::csr_row_op( - row_ind, batchSize, nnz, + row_ind, + batchSize, + nnz, [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__( Index_ row, Index_ start_idx, Index_ stop_idx) { fused_op(row, start_idx, stop_idx); @@ -106,14 +116,23 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, stream); } -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream) { - csr_adj_graph_batched( - row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, - [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream) +{ + csr_adj_graph_batched(row_ind, + total_rows, + nnz, + batchSize, + adj, + row_ind_ptr, + stream, + [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); } /** @@ -129,13 +148,17 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream cuda stream to use * @param fused_op the fused operation */ -template void> -void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { - csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, - adj, row_ind_ptr, stream, fused_op); +template void> +void csr_adj_graph(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ + csr_adj_graph_batched( + row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op); } /** @@ -148,8 +171,8 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - cudaStream_t stream) { +void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream) +{ rmm::device_uvector row_counts(m, stream); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -157,11 +180,9 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); } /** @@ -172,7 +193,8 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream) { +void sorted_coo_to_csr(COO* coo, int* row_ind, cudaStream_t stream) +{ sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); } diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh index 299f9d36d4..e90882b501 100644 --- a/cpp/include/raft/sparse/convert/dense.cuh +++ b/cpp/include/raft/sparse/convert/dense.cuh @@ -37,22 +37,20 @@ namespace sparse { namespace convert { template -__global__ void csr_to_dense_warp_per_row_kernel(int n_cols, - const value_t *csrVal, - const int *csrRowPtr, - const int *csrColInd, - value_t *a) { +__global__ void csr_to_dense_warp_per_row_kernel( + int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a) +{ int row = blockIdx.x; int tid = threadIdx.x; int colStart = csrRowPtr[row]; - int colEnd = csrRowPtr[row + 1]; - int rowNnz = colEnd - colStart; + int colEnd = csrRowPtr[row + 1]; + int rowNnz = colEnd - colStart; for (int i = tid; i < rowNnz; i += blockDim.x) { int colIdx = colStart + i; if (colIdx < colEnd) { - int col = csrColInd[colIdx]; + int col = csrColInd[colIdx]; a[row * n_cols + col] = csrVal[colIdx]; } } @@ -77,10 +75,17 @@ __global__ void csr_to_dense_warp_per_row_kernel(int n_cols, * @param[in] row_major : Is row-major output desired? */ template -void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data, value_idx lda, value_t *out, - cudaStream_t stream, bool row_major = true) { +void csr_to_dense(cusparseHandle_t handle, + value_idx nrows, + value_idx ncols, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx lda, + value_t* out, + cudaStream_t stream, + bool row_major = true) +{ if (!row_major) { /** * If we need col-major, use cusparse. @@ -91,15 +96,13 @@ void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( - handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, - lda, stream)); + handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream)); CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); } else { int blockdim = block_dim(ncols); - CUDA_CHECK( - cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); csr_to_dense_warp_per_row_kernel<<>>( ncols, csr_data, csr_indptr, csr_indices, out); } diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index fa21614f8f..ad1bac1e75 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -66,79 +66,79 @@ class COO { Index_Type n_cols; /** - * @param stream: CUDA stream to use - */ + * @param stream: CUDA stream to use + */ COO(cudaStream_t stream) - : rows_arr(0, stream), - cols_arr(0, stream), - vals_arr(0, stream), - nnz(0), - n_rows(0), - n_cols(0) {} + : rows_arr(0, stream), cols_arr(0, stream), vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0) + { + } /** - * @param rows: coo rows array - * @param cols: coo cols array - * @param vals: coo vals array - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - */ - COO(rmm::device_uvector &rows, - rmm::device_uvector &cols, rmm::device_uvector &vals, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0) - : rows_arr(rows), - cols_arr(cols), - vals_arr(vals), - nnz(nnz), - n_rows(n_rows), - n_cols(n_cols) {} + * @param rows: coo rows array + * @param cols: coo cols array + * @param vals: coo vals array + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + */ + COO(rmm::device_uvector& rows, + rmm::device_uvector& cols, + rmm::device_uvector& vals, + Index_Type nnz, + Index_Type n_rows = 0, + Index_Type n_cols = 0) + : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols) + { + } /** - * @param stream: CUDA stream to use - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param init: initialize arrays with zeros - */ - COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, - Index_Type n_cols = 0, bool init = true) + * @param stream: CUDA stream to use + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + * @param init: initialize arrays with zeros + */ + COO(cudaStream_t stream, + Index_Type nnz, + Index_Type n_rows = 0, + Index_Type n_cols = 0, + bool init = true) : rows_arr(nnz, stream), cols_arr(nnz, stream), vals_arr(nnz, stream), nnz(nnz), n_rows(n_rows), - n_cols(n_cols) { + n_cols(n_cols) + { if (init) init_arrays(stream); } - void init_arrays(cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK( - cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); + void init_arrays(cudaStream_t stream) + { + CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); } ~COO() {} /** - * @brief Size should be > 0, with the number of rows - * and cols in the dense matrix being > 0. - */ - bool validate_size() const { + * @brief Size should be > 0, with the number of rows + * and cols in the dense matrix being > 0. + */ + bool validate_size() const + { if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false; return true; } /** - * @brief If the underlying arrays have not been set, - * return false. Otherwise true. - */ - bool validate_mem() const { - if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || - this->vals_arr.size() == 0) { + * @brief If the underlying arrays have not been set, + * return false. Otherwise true. + */ + bool validate_mem() const + { + if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) { return false; } @@ -148,33 +148,30 @@ class COO { /* * @brief Returns the rows array */ - Index_Type *rows() { return this->rows_arr.data(); } + Index_Type* rows() { return this->rows_arr.data(); } /** * @brief Returns the cols array */ - Index_Type *cols() { return this->cols_arr.data(); } + Index_Type* cols() { return this->cols_arr.data(); } /** * @brief Returns the vals array */ - T *vals() { return this->vals_arr.data(); } + T* vals() { return this->vals_arr.data(); } /** - * @brief Send human-readable state information to output stream - */ - friend std::ostream &operator<<(std::ostream &out, - const COO &c) { + * @brief Send human-readable state information to output stream + */ + friend std::ostream& operator<<(std::ostream& out, const COO& c) + { if (c.validate_size() && c.validate_mem()) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) - << std::endl; - out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) - << std::endl; - out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) - << std::endl; + out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl; + out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl; + out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl; out << "nnz=" << c.nnz << std::endl; out << "n_rows=" << c.n_rows << std::endl; out << "n_cols=" << c.n_cols << std::endl; @@ -188,58 +185,59 @@ class COO { } /** - * @brief Set the number of rows and cols - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - */ - void setSize(int n_rows, int n_cols) { + * @brief Set the number of rows and cols + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + */ + void setSize(int n_rows, int n_cols) + { this->n_rows = n_rows; this->n_cols = n_cols; } /** - * @brief Set the number of rows and cols for a square dense matrix - * @param n: number of rows and cols - */ - void setSize(int n) { + * @brief Set the number of rows and cols for a square dense matrix + * @param n: number of rows and cols + */ + void setSize(int n) + { this->n_rows = n; this->n_cols = n; } /** - * @brief Allocate the underlying arrays - * @param nnz: size of underlying row/col/val arrays - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, bool init, cudaStream_t stream) { - this->allocate(nnz, 0, init, stream); - } + * @brief Allocate the underlying arrays + * @param nnz: size of underlying row/col/val arrays + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param size: the number of rows/cols in a square dense matrix - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, int size, bool init, cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param size: the number of rows/cols in a square dense matrix + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, int size, bool init, cudaStream_t stream) + { this->allocate(nnz, size, size, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - * @param init: should values be initialized to 0? - * @param stream: stream to use for init - */ - void allocate(int nnz, int n_rows, int n_cols, bool init, - cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + * @param init: should values be initialized to 0? + * @param stream: stream to use for init + */ + void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream) + { this->n_rows = n_rows; this->n_cols = n_cols; - this->nnz = nnz; + this->nnz = nnz; this->rows_arr.resize(this->nnz, stream); this->cols_arr.resize(this->nnz, stream); diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index 041aedf41c..f821ce2b98 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -41,57 +41,64 @@ namespace sparse { struct WeakCCState { public: - bool *m; - WeakCCState(bool *m) : m(m) {} + bool* m; + WeakCCState(bool* m) : m(m) {} }; template -__global__ void weak_cc_label_device(Index_ *__restrict__ labels, - const Index_ *__restrict__ row_ind, - const Index_ *__restrict__ row_ind_ptr, - Index_ nnz, bool *__restrict__ m, - Index_ start_vertex_id, Index_ batch_size, - Index_ N, Lambda filter_op) { - Index_ tid = threadIdx.x + blockIdx.x * TPB_X; +__global__ void weak_cc_label_device(Index_* __restrict__ labels, + const Index_* __restrict__ row_ind, + const Index_* __restrict__ row_ind_ptr, + Index_ nnz, + bool* __restrict__ m, + Index_ start_vertex_id, + Index_ batch_size, + Index_ N, + Lambda filter_op) +{ + Index_ tid = threadIdx.x + blockIdx.x * TPB_X; Index_ global_id = tid + start_vertex_id; if (tid < batch_size && global_id < N) { Index_ start = __ldg(row_ind + tid); Index_ ci, cj; - bool ci_mod = false; - ci = labels[global_id]; + bool ci_mod = false; + ci = labels[global_id]; bool ci_allow_prop = filter_op(global_id); Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind); /// TODO: add one element to row_ind and avoid get_stop_idx for (Index_ j = start; j < end; j++) { - Index_ j_ind = __ldg(row_ind_ptr + j); - cj = labels[j_ind]; + Index_ j_ind = __ldg(row_ind_ptr + j); + cj = labels[j_ind]; bool cj_allow_prop = filter_op(j_ind); if (ci < cj && ci_allow_prop) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + j_ind), ci); + atomicMin((int*)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + j_ind), ci); + atomicMin((long long int*)(labels + j_ind), ci); if (cj_allow_prop) *m = true; } else if (ci > cj && cj_allow_prop) { - ci = cj; + ci = cj; ci_mod = true; } } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + global_id), ci); + atomicMin((int*)(labels + global_id), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + global_id), ci); + atomicMin((long long int*)(labels + global_id), ci); if (ci_allow_prop) *m = true; } } } template -__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, - Index_ MAX_LABEL, Lambda filter_op) { +__global__ void weak_cc_init_all_kernel(Index_* labels, + Index_ N, + Index_ MAX_LABEL, + Lambda filter_op) +{ Index_ tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (filter_op(tid)) @@ -123,22 +130,25 @@ __global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream, - Lambda filter_op) { - ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, - "Index_ should be 4 or 8 bytes"); +template bool> +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream, + Lambda filter_op) +{ + ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes"); bool host_m; Index_ MAX_LABEL = std::numeric_limits::max(); weak_cc_init_all_kernel - <<>>( - labels, N, MAX_LABEL, filter_op); + <<>>(labels, N, MAX_LABEL, filter_op); CUDA_CHECK(cudaPeekAtLastError()); int n_iters = 0; @@ -147,8 +157,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, weak_cc_label_device <<>>( - labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, - batch_size, N, filter_op); + labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op); CUDA_CHECK(cudaPeekAtLastError()); //** Updating m * @@ -180,12 +189,25 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param stream the cuda stream to use */ template -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream) { - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id, - batch_size, state, stream, +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream) +{ + weak_cc_batched(labels, + row_ind, + row_ind_ptr, + nnz, + N, + start_vertex_id, + batch_size, + state, + stream, [] __device__(Index_ tid) { return true; }); } @@ -212,14 +234,18 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) { +template bool> +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + cudaStream_t stream, + Lambda filter_op) +{ rmm::device_scalar m(stream); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, filter_op); + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); } /** @@ -244,12 +270,17 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param stream the cuda stream to use */ template -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, cudaStream_t stream) { +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + cudaStream_t stream) +{ rmm::device_scalar m(stream); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, [](Index_) { return true; }); + weak_cc_batched( + labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); } }; // namespace sparse diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index d072100672..29a244a962 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -23,10 +23,9 @@ //#include #define _CUSPARSE_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; -//Notes: +// Notes: //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; //(2.) to enforce a lower version, // @@ -43,16 +42,15 @@ namespace raft { * @brief Exception thrown when a cuSparse error is encountered. */ struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) - : raft::exception(message) {} - explicit cusparse_error(std::string const& message) - : raft::exception(message) {} + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} }; namespace sparse { namespace detail { -inline const char* cusparse_error_to_string(cusparseStatus_t err) { +inline const char* cusparse_error_to_string(cusparseStatus_t err) +{ #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(err); #else // CUDART_VERSION @@ -65,8 +63,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: - return "CUSPARSE_STATUS_UNKNOWN"; + default: return "CUSPARSE_STATUS_UNKNOWN"; }; #endif // CUDART_VERSION } @@ -88,8 +85,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { cusparseStatus_t const status = (call); \ if (CUSPARSE_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::sparse::detail::cusparse_error_to_string(status)); \ throw raft::cusparse_error(msg); \ } \ @@ -100,13 +100,15 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { //@todo: use logger here once logging is enabled /** check for cusparse runtime API errors but do not assert */ -#define CUSPARSE_CHECK_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ +#define CUSPARSE_CHECK_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ } while (0) namespace raft { @@ -117,28 +119,34 @@ namespace sparse { * @{ */ template -cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals, - T* vals_sorted, int* d_P, cudaStream_t stream); +cusparseStatus_t cusparsegthr( + cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const double* vals, double* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const double* vals, + double* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const float* vals, float* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const float* vals, + float* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } /** @} */ @@ -148,15 +156,18 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, * @{ */ template -void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz, - int m, T* csrRowPtr, cudaStream_t stream); +void cusparsecoo2csr( + cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream); template <> -inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, - int nnz, int m, int* csrRowPtr, - cudaStream_t stream) { +inline void cusparsecoo2csr(cusparseHandle_t handle, + const int* cooRowInd, + int nnz, + int m, + int* csrRowPtr, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -166,30 +177,54 @@ inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, */ template size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows, - const T* cooCols, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* cooRows, + const T* cooCols, + cudaStream_t stream); template <> inline size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows, - const int* cooCols, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cooRows, + const int* cooCols, + cudaStream_t stream) +{ size_t val; CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); + CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); return val; } template void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P, - void* pBuffer, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + T* cooRows, + T* cooCols, + T* P, + void* pBuffer, + cudaStream_t stream); template <> inline void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols, - int* P, void* pBuffer, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRows, + int* cooCols, + int* P, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); + CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); } /** @} */ @@ -199,37 +234,67 @@ inline void cusparsecoosortByRow( // NOLINT */ template cusparseStatus_t cusparsegemmi( // NOLINT - cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, - const T* A, int lda, const T* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const T* alpha, + const T* A, + int lda, + const T* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const float* alpha, - const float* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float* alpha, + const float* A, + int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, - float* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseSgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const double* alpha, - const double* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double* alpha, + const double* A, + int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, - double* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseDgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } /** @} */ @@ -241,49 +306,94 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, */ template cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, int64_t nnz, - IndexT* csrRowOffsets, IndexT* csrColInd, + int64_t rows, + int64_t cols, + int64_t nnz, + IndexT* csrRowOffsets, + IndexT* csrColInd, ValueT* csrValues); template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } /** @} */ @@ -292,16 +402,19 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, * @{ */ template -cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, T* values); +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values); template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, float* values) { + int64_t size, + float* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, double* values) { + int64_t size, + double* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); } /** @} */ @@ -312,23 +425,30 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, */ template cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, int64_t ld, - T* values, cusparseOrder_t order); + int64_t rows, + int64_t cols, + int64_t ld, + T* values, + cusparseOrder_t order); template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, float* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + float* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order); } template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, double* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + double* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order); } /** @} */ @@ -337,58 +457,89 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, * @{ */ template -cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_32F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_64F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize); } template -cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, const T* beta, +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, T* externalBuffer, + cusparseSpMVAlg_t alg, + T* externalBuffer, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - double* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -398,29 +549,59 @@ inline cusparseStatus_t cusparsespmv( */ template cusparseStatus_t cusparsecsrmv( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const T* beta, + T* y, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, - float* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const float* beta, + float* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseScsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, const double* x, - const double* beta, double* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const double* beta, + double* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseDcsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } /** @} */ #endif @@ -431,58 +612,96 @@ inline cusparseStatus_t cusparsecsrmv( * @{ */ template -cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_32F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_64F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize); } template -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + T* externalBuffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_32F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_64F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -492,31 +711,68 @@ inline cusparseStatus_t cusparsespmm( */ template cusparseStatus_t cusparsecsrmm( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, - const T* beta, T* y, const int ldy, cudaStream_t stream); + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const int ldx, + const T* beta, + T* y, + const int ldy, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const float* alpha, const cusparseMatDescr_t descr, - const float* csrVal, const int* csrRowPtr, const int* csrColInd, - const float* x, const int ldx, const float* beta, float* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const int ldx, + const float* beta, + float* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseScsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const double* alpha, const cusparseMatDescr_t descr, - const double* csrVal, const int* csrRowPtr, const int* csrColInd, - const double* x, const int ldx, const double* beta, double* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const int ldx, + const double* beta, + double* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseDcsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ #endif @@ -527,15 +783,22 @@ inline cusparseStatus_t cusparsecsrmm( */ template void cusparsecsr2coo( // NOLINT - cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, - T* cooRowInd, cudaStream_t stream); + cusparseHandle_t handle, + const int n, + const int nnz, + const T* csrRowPtr, + T* cooRowInd, + cudaStream_t stream); template <> -inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, - const int* csrRowPtr, int* cooRowInd, - cudaStream_t stream) { +inline void cusparsecsr2coo(cusparseHandle_t handle, + const int n, + const int nnz, + const int* csrRowPtr, + int* cooRowInd, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -553,7 +816,8 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, // template<> inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, cusparsePointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSetPointerMode(handle, mode); } @@ -564,69 +828,203 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, * @{ */ template -cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA, - CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y, - CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes); -} -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA, - CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y, - CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes); +cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + size_t* bufferSizeInBytes, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + bufferSizeInBytes); +} +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + bufferSizeInBytes); } template -cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, T* buffer, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, float* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, - descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x, - CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F, - CUDA_R_32F, buffer); -} -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, double* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, - descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x, - CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F, - CUDA_R_64F, buffer); +cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + T* buffer, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + float* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + buffer); +} +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + double* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + buffer); } /** @} */ @@ -637,68 +1035,180 @@ inline cusparseStatus_t cusparsecsrmvex( */ template -cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + bufferSize); } template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + bufferSize); } template -cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_32F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + buffer); } template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_64F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + buffer); } /** @} */ @@ -709,120 +1219,329 @@ inline cusparseStatus_t cusparsecsr2csc( */ template -cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta, - const cusparseMatDescr_t matA, int nnzA, const int* rowindA, - const int* indicesA, const cusparseMatDescr_t matB, int nnzB, - const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD, - int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info, - size_t* pBufferSizeInBytes, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const float* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const T* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseScsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const double* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const double* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseDcsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } -inline cusparseStatus_t cusparsecsrgemm2nnz( - cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA, - int nnzA, const int* rowindA, const int* indicesA, - const cusparseMatDescr_t matB, int nnzB, const int* rowindB, - const int* indicesB, const cusparseMatDescr_t matD, int nnzD, - const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC, - int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle, + int m, + int n, + int k, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + const cusparseMatDescr_t matC, + int* rowindC, + int* nnzC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA, - matB, nnzB, rowindB, indicesB, matD, nnzD, - rowindD, indicesD, matC, rowindC, nnzC, info, + return cusparseXcsrgemm2Nnz(handle, + m, + n, + k, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + matD, + nnzD, + rowindD, + indicesD, + matC, + rowindC, + nnzC, + info, pBuffer); #pragma GCC diagnostic pop } template -cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, - const cusparseMatDescr_t descrA, int nnzA, const T* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD, - const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC, - T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info, - void* pBuffer, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const cusparseMatDescr_t descrA, int nnzA, const float* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const float* beta, const cusparseMatDescr_t descrD, int nnzD, - const float* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const T* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const T* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const T* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + T* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const float* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + float* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseScsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const cusparseMatDescr_t descrA, int nnzA, const double* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const double* beta, const cusparseMatDescr_t descrD, int nnzD, - const double* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const double* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + double* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseDcsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } @@ -834,33 +1553,46 @@ inline cusparseStatus_t cusparsecsrgemm2( */ template -cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, - const int* csrColIndA, T* A, int lda, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + T* A, + int lda, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const float* csrValA, const int* csrRowPtrA, - const int* csrColIndA, float* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + float* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const double* csrValA, const int* csrRowPtrA, - const int* csrColIndA, double* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + double* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } /** @} */ diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 1c55412eec..29c823bcdb 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -24,31 +24,31 @@ namespace distance { template struct distances_config_t { - distances_config_t(const raft::handle_t &handle_) : handle(handle_) {} + distances_config_t(const raft::handle_t& handle_) : handle(handle_) {} // left side value_idx a_nrows; value_idx a_ncols; value_idx a_nnz; - value_idx *a_indptr; - value_idx *a_indices; - value_t *a_data; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; // right side value_idx b_nrows; value_idx b_ncols; value_idx b_nnz; - value_idx *b_indptr; - value_idx *b_indices; - value_t *b_data; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; - const raft::handle_t &handle; + const raft::handle_t& handle; }; template class distances_t { public: - virtual void compute(value_t *out) {} + virtual void compute(value_t* out) {} virtual ~distances_t() = default; }; diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index 3f8c32a20b..4d3b31df9a 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -35,9 +35,11 @@ namespace distance { namespace detail { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_binary_row_norm_kernel( - value_t *out, const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, value_idx nnz) { +__global__ void compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; if (i < nnz) { // We do conditional here only because it's @@ -49,54 +51,63 @@ __global__ void compute_binary_row_norm_kernel( } template -__global__ void compute_binary_warp_kernel(value_t *__restrict__ C, - const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t q_norm = Q_norms[i]; - value_t r_norm = R_norms[j]; - value_t dot = C[(size_t)i * n_cols + j]; + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); } -template -void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func, cudaStream_t stream) { +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_binary_warp_kernel<<>>( C, Q_norms, R_norms, n_rows, n_cols, expansion_func); } -template -void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, - cudaStream_t stream, expansion_f expansion_func) { +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_binary_row_norm_kernel<<>>( Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_binary_row_norm_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, - stream); + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); } /** @@ -106,44 +117,51 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, template class jaccard_expanded_distances_t : public distances_t { public: - explicit jaccard_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t denom = q_r_union - dot; - - value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); - - // flip the similarity when both rows are 0 - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * jacc) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); } ~jaccard_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -155,40 +173,47 @@ class jaccard_expanded_distances_t : public distances_t { template class dice_expanded_distances_t : public distances_t { public: - explicit dice_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t dice = (2 * dot) / q_r_union; - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * dice) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); } ~dice_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index 83844b8c54..6694d0fc4f 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -39,19 +39,29 @@ namespace sparse { namespace distance { namespace detail { -template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); - - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, - chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); }; /** @@ -87,39 +97,55 @@ inline void balanced_coo_pairwise_generalized_spmv( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); int max_cols = max_cols_per_block(); if (max_cols > config_.a_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } }; -template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); }; /** @@ -158,24 +184,30 @@ inline void balanced_coo_pairwise_generalized_spmv_rev( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ // try dense first int max_cols = max_cols_per_block(); if (max_cols > config_.b_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } }; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh index 866ff43224..9bfdd3bad0 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh @@ -27,68 +27,88 @@ namespace sparse { namespace distance { namespace detail { /** - * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with - * sparse-matrix-sparse-vector multiplication layout (SPMV). - * This is intended to be scheduled n_chunks_b times for each row of a. - * The steps are as follows: - * - * 1. Load row from A into dense vector in shared memory. - * This can be further chunked in the future if necessary to support larger - * column sizes. - * 2. Threads of block all step through chunks of B in parallel. - * When a new row is encountered in row_indices_b, a segmented - * reduction is performed across the warps and then across the - * block and the final value written out to host memory. - * - * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf - * - * @tparam value_idx index type - * @tparam value_t value type - * @tparam tpb threads per block configured on launch - * @tparam rev if this is true, the reduce/accumulate functions are only - * executed when A[col] == 0.0. when executed before/after !rev - * and A & B are reversed, this allows the full symmetric difference - * and intersection to be computed. - * @tparam kv_t data type stored in shared mem cache - * @tparam product_f reduce function type (semiring product() function). - * accepts two arguments of value_t and returns a value_t - * @tparam accum_f accumulation function type (semiring sum() function). - * accepts two arguments of value_t and returns a value_t - * @tparam write_f function to write value out. this should be mathematically - * equivalent to the accumulate function but implemented as - * an atomic operation on global memory. Accepts two arguments - * of value_t* and value_t and updates the value given by the - * pointer. - * @param[in] indptrA column pointer array for A - * @param[in] indicesA column indices array for A - * @param[in] dataA data array for A - * @param[in] rowsB coo row array for B - * @param[in] indicesB column indices array for B - * @param[in] dataB data array for B - * @param[in] m number of rows in A - * @param[in] n number of rows in B - * @param[in] dim number of features - * @param[in] nnz_b number of nonzeros in B - * @param[out] out array of size m*n - * @param[in] n_blocks_per_row number of blocks of B per row of A - * @param[in] chunk_size number of nnz for B to use for each row of A - * @param[in] buffer_size amount of smem to use for each row of A - * @param[in] product_func semiring product() function - * @param[in] accum_func semiring sum() function - * @param[in] write_func atomic semiring sum() function - */ -template -__global__ void balanced_coo_generalized_spmv_kernel( - strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA, - value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB, - value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out, - int n_blocks_per_row, int chunk_size, value_idx b_ncols, - product_f product_func, accum_f accum_func, write_f write_func) { + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. + * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ typedef cub::WarpReduce warp_reduce; - value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; // chunk starting offset @@ -96,18 +116,17 @@ __global__ void balanced_coo_generalized_spmv_kernel( // how many total cols will be processed by this block (should be <= chunk_size * n_threads) value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); - int tid = threadIdx.x; + int tid = threadIdx.x; int warp_id = tid / raft::warp_size(); // compute id relative to current warp unsigned int lane_id = tid & (raft::warp_size() - 1); - value_idx ind = ind_offset + threadIdx.x; + value_idx ind = ind_offset + threadIdx.x; extern __shared__ char smem[]; - typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); - typename warp_reduce::TempStorage *temp_storage = - (typename warp_reduce::TempStorage *)(A + dim); + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); auto inserter = strategy.init_insert(A, dim); @@ -115,13 +134,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( value_idx start_offset_a, stop_offset_a; bool first_a_chunk, last_a_chunk; - indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a, - n_blocks_per_row, first_a_chunk, last_a_chunk); + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); // Convert current row vector in A to dense for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) { - strategy.insert(inserter, indicesA[start_offset_a + i], - dataA[start_offset_a + i]); + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); } __syncthreads(); @@ -132,34 +150,36 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (ind >= nnz_b) return; value_idx start_index_a = 0, stop_index_a = b_ncols - 1; - indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a, - stop_offset_a, start_index_a, stop_index_a, - first_a_chunk, last_a_chunk); + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); value_idx cur_row_b = -1; - value_t c = 0.0; + value_t c = 0.0; auto warp_red = warp_reduce(*(temp_storage + warp_id)); if (tid < active_chunk_size) { cur_row_b = rowsB[ind]; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = product_func(a_col, dataB[ind]); - } + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } } } // loop through chunks in parallel, reducing when a new row is // encountered by each thread for (int i = tid; i < active_chunk_size; i += blockDim.x) { - value_idx ind_next = ind + blockDim.x; + value_idx ind_next = ind + blockDim.x; value_idx next_row_b = -1; if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; @@ -170,14 +190,13 @@ __global__ void balanced_coo_generalized_spmv_kernel( // grab the threads currently participating in loops. // because any other threads should have returned already. unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); - bool is_leader = get_lowest_peer(peer_group) == lane_id; - value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); // thread with lowest lane id among peers writes out if (is_leader && v != 0.0) { // this conditional should be uniform, since rev is constant - size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b - : (size_t)cur_row_b * m + cur_row_a; + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; write_func(out + idx, v); } @@ -187,15 +206,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (next_row_b != -1) { ind = ind_next; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = accum_func(c, product_func(a_col, dataB[ind])); - } + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } } cur_row_b = next_row_b; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh index 4ad3368c4a..9b1dfff022 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh @@ -31,58 +31,114 @@ namespace detail { template class coo_spmv_strategy { public: - coo_spmv_strategy(const distances_config_t &config_) - : config(config_) { + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { smem = raft::getSharedMemPerBlock(); } - template - void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr, - value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size, int n_blocks, - int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz, - coo_rows_b, config.b_indices, config.b_data, config.a_nrows, - config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row, - chunk_size, config.b_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); } - template - void _dispatch_base_rev(strategy_t &strategy, int smem_dim, - indptr_it &b_indptr, value_t *out_dists, - value_idx *coo_rows_a, product_f product_func, - accum_f accum_func, write_f write_func, - int chunk_size, int n_blocks, int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz, - coo_rows_a, config.a_indices, config.a_data, config.b_nrows, - config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row, - chunk_size, config.a_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); } protected: int smem; - const distances_config_t &config; + const distances_config_t& config; }; } // namespace detail diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh index 0ab7b65ac2..da51767307 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -29,11 +29,15 @@ namespace detail { template class mask_row_it { public: - mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_ = NULL) - : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {} + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { if (mask_row_idx != NULL) { return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; } else { @@ -41,37 +45,49 @@ class mask_row_it { } } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { start_offset = full_indptr[row_idx]; - stop_offset = full_indptr[row_idx + 1] - 1; + stop_offset = full_indptr[row_idx + 1] - 1; } - __device__ constexpr inline void get_indices_boundary( - const value_idx *indices, value_idx &indices_len, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { // do nothing; } - __device__ constexpr inline bool check_indices_bounds( - value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) { + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return true; } const value_idx *full_indptr, &n_rows; - value_idx *mask_row_idx; + value_idx* mask_row_idx; }; template -__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, - value_idx *chunk_indices, - value_idx n_rows) { +__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n_rows) { auto start = n_chunks_per_row[tid]; - auto end = n_chunks_per_row[tid + 1]; + auto end = n_chunks_per_row[tid + 1]; #pragma unroll for (int i = start; i < end; i++) { @@ -83,73 +99,89 @@ __global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, template class chunked_mask_row_it : public mask_row_it { public: - chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_, int row_chunk_size_, - const value_idx *n_chunks_per_row_, - const value_idx *chunk_indices_, + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, const cudaStream_t stream_) : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), row_chunk_size(row_chunk_size_), n_chunks_per_row(n_chunks_per_row_), chunk_indices(chunk_indices_), - stream(stream_) {} + stream(stream_) + { + } - static void init(const value_idx *indptr, const value_idx *mask_row_idx, - const value_idx &n_rows, const int row_chunk_size, - rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, - cudaStream_t stream) { + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto policy = rmm::exec_policy(stream); constexpr value_idx first_element = 0; n_chunks_per_row.set_element_async(0, first_element, stream); n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); - thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows, - n_chunks_per_row.begin() + 1, chunk_functor); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); - thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1, - n_chunks_per_row.end(), - n_chunks_per_row.begin() + 1); + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); - raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, - stream); + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { - auto chunk_index = blockIdx.x / n_blocks_nnz_b; - auto chunk_val = chunk_indices[chunk_index]; - auto prev_n_chunks = n_chunks_per_row[chunk_val]; + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; auto relative_chunk = chunk_index - prev_n_chunks; - first_a_chunk = relative_chunk == 0; + first_a_chunk = relative_chunk == 0; start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; - stop_offset = start_offset + row_chunk_size; + stop_offset = start_offset + row_chunk_size; auto final_stop_offset = this->full_indptr[row_idx + 1]; last_a_chunk = stop_offset >= final_stop_offset; - stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; } - __device__ inline void get_indices_boundary( - const value_idx *indices, value_idx &row_idx, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; - stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; } - __device__ inline bool check_indices_bounds(value_idx &start_index_a, - value_idx &stop_index_a, - value_idx &index_b) { + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return (index_b >= start_index_a && index_b <= stop_index_a); } @@ -160,30 +192,34 @@ class chunked_mask_row_it : public mask_row_it { struct n_chunks_per_row_functor { public: - n_chunks_per_row_functor(const value_idx *indptr_, - value_idx row_chunk_size_) - : indptr(indptr_), row_chunk_size(row_chunk_size_) {} + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } - __host__ __device__ value_idx operator()(const value_idx &i) { + __host__ __device__ value_idx operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return raft::ceildiv(degree, (value_idx)row_chunk_size); } - const value_idx *indptr; + const value_idx* indptr; value_idx row_chunk_size; }; private: - static void fill_chunk_indices( - const value_idx &n_rows, rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, cudaStream_t stream) { + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto n_threads = std::min(n_rows, 256); - auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); chunk_indices.resize(total_row_blocks, stream); - fill_chunk_indices_kernel<<>>( - n_chunks_per_row.data(), chunk_indices.data(), n_rows); + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); } }; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh index 79a5f154d0..5a1c152bd0 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh @@ -26,71 +26,91 @@ namespace detail { template class dense_smem_strategy : public coo_spmv_strategy { public: - using smem_type = value_t *; + using smem_type = value_t*; using insert_type = smem_type; - using find_type = smem_type; + using find_type = smem_type; - dense_smem_strategy(const distances_config_t &config_) - : coo_spmv_strategy(config_) {} + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } - inline static int smem_per_block(int n_cols) { - return (n_cols * sizeof(value_t)) + - ((1024 / raft::warp_size()) * sizeof(value_t)); + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.b_nnz, chunk_size * 1024); - auto n_blocks = this->config.a_nrows * n_blocks_per_row; - - mask_row_it a_indptr(this->config.a_indptr, - this->config.a_nrows); - - this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists, - coo_rows_b, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.a_nnz, chunk_size * 1024); - auto n_blocks = this->config.b_nrows * n_blocks_per_row; - - mask_row_it b_indptr(this->config.b_indptr, - this->config.b_nrows); - - this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists, - coo_rows_a, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { cache[k] = 0.0; } return cache; } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { cache[key] = value; } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return cache; } - __device__ inline value_t find(find_type cache, const value_idx &key) { - return cache[key]; - } + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } }; } // namespace detail diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh index 5ba2d5c102..4f8637b425 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh @@ -1,18 +1,18 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once @@ -39,177 +39,238 @@ template class hash_strategy : public coo_spmv_strategy { public: using insert_type = - typename cuco::static_map::device_mutable_view; - using smem_type = typename insert_type::slot_type *; + typename cuco::static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; using find_type = - typename cuco::static_map::device_view; + typename cuco::static_map::device_view; - hash_strategy(const distances_config_t &config_, - float capacity_threshold_ = 0.5, int map_size_ = get_map_size()) + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) : coo_spmv_strategy(config_), capacity_threshold(capacity_threshold_), - map_size(map_size_) {} + map_size(map_size_) + { + } - void chunking_needed(const value_idx *indptr, const value_idx n_rows, - rmm::device_uvector &mask_indptr, - std::tuple &n_rows_divided, - cudaStream_t stream) { + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { auto policy = this->config.handle.get_thrust_policy(); - auto less = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), mask_indptr.data(), - fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); std::get<0>(n_rows_divided) = less - mask_indptr.data(); auto more = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), less, - fits_in_hash_table(indptr, capacity_threshold * map_size, - std::numeric_limits::max())); + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); std::get<1>(n_rows_divided) = more - less; } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.a_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.a_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.a_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.a_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.a_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.b_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.b_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.b_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.b_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.b_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { return insert_type::make_from_uninitialized_slots( cooperative_groups::this_thread_block(), cache, cache_size, -1, 0); } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { auto success = cache.insert(cuco::pair(key, value)); } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return find_type(cache, cache_size, -1, 0); } - __device__ inline value_t find(find_type cache, const value_idx &key) { + __device__ inline value_t find(find_type cache, const value_idx& key) + { auto a_pair = cache.find(key); value_t a_col = 0.0; - if (a_pair != cache.end()) { - a_col = a_pair->second; - } + if (a_pair != cache.end()) { a_col = a_pair->second; } return a_col; } struct fits_in_hash_table { public: - fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_, - value_idx degree_r_) - : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {} + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } - __host__ __device__ bool operator()(const value_idx &i) { + __host__ __device__ bool operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return degree >= degree_l && degree < degree_r; } private: - const value_idx *indptr; + const value_idx* indptr; const value_idx degree_l, degree_r; }; - inline static int get_map_size() { - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(typename insert_type::slot_type); } diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh index 2cd7b670d8..bde979a993 100644 --- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh @@ -42,35 +42,38 @@ template class ip_distances_t : public distances_t { public: /** - * Computes simple sparse inner product distances as sum(x_y * y_k) - * @param[in] config specifies inputs, outputs, and sizes - */ - ip_distances_t(const distances_config_t &config) - : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) { - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows_b.data(), config_->b_nnz, + * Computes simple sparse inner product distances as sum(x_y * y_k) + * @param[in] config specifies inputs, outputs, and sizes + */ + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, config_->handle.get_stream()); } /** - * Performs pairwise distance computation and computes output distances - * @param out_distances dense output matrix (size a_nrows * b_nrows) - */ - void compute(value_t *out_distances) { + * Performs pairwise distance computation and computes output distances + * @param out_distances dense output matrix (size a_nrows * b_nrows) + */ + void compute(value_t* out_distances) + { /** - * Compute pairwise distances and return dense matrix in row-major format - */ + * Compute pairwise distances and return dense matrix in row-major format + */ balanced_coo_pairwise_generalized_spmv( - out_distances, *config_, coo_rows_b.data(), Product(), Sum(), - AtomicAdd()); + out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd()); } - value_idx *b_rows_coo() { return coo_rows_b.data(); } + value_idx* b_rows_coo() { return coo_rows_b.data(); } - value_t *b_data_coo() { return config_->b_data; } + value_t* b_data_coo() { return config_->b_data; } private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector coo_rows_b; }; diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index f06a15215c..a4a534823f 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -38,35 +38,36 @@ namespace detail { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_row_norm_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i] * data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } } template -__global__ void compute_row_sum_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } } template -__global__ void compute_euclidean_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; @@ -80,25 +81,29 @@ __global__ void compute_euclidean_warp_kernel( } template -__global__ void compute_correlation_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols, - value_idx n) { +__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t dot = C[(size_t)i * n_cols + j]; + value_t dot = C[(size_t)i * n_cols + j]; value_t Q_l1 = Q_norms[i]; value_t R_l1 = R_norms[j]; value_t Q_l2 = Q_sq_norms[i]; value_t R_l2 = R_sq_norms[j]; - value_t numer = n * dot - (Q_l1 * R_l1); + value_t numer = n * dot - (Q_l1 * R_l1); value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); value_t R_denom = n * R_l2 - (R_l1 * R_l1); @@ -108,56 +113,75 @@ __global__ void compute_correlation_warp_kernel( C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); } -template -void compute_euclidean(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, value_idx n_rows, - value_idx n_cols, cudaStream_t stream, - expansion_f expansion_func) { +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_euclidean_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); } -template -void compute_l2(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, cudaStream_t stream, - expansion_f expansion_func) { +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_row_norm_kernel<<>>( R_sq_norms.data(), R_coo_rows, R_data, R_nnz); - compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, - expansion_func); + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); } template -void compute_correlation(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, const value_t *Q_norms, - const value_t *R_norms, value_idx n_rows, - value_idx n_cols, value_idx n, cudaStream_t stream) { +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_correlation_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); } template -void compute_corr(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, - cudaStream_t stream) { +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, + cudaStream_t stream) +{ // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); @@ -166,15 +190,11 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); @@ -186,8 +206,15 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, compute_row_sum_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(), - R_norms.data(), m, n, n_cols, stream); + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); } /** @@ -197,35 +224,44 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, template class l2_expanded_distances_t : public distances_t { public: - explicit l2_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - return -2 * dot + q_norm + r_norm; - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); } ~l2_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; @@ -234,18 +270,21 @@ class l2_expanded_distances_t : public distances_t { * The expanded form is more efficient for sparse data. */ template -class l2_sqrt_expanded_distances_t - : public l2_expanded_distances_t { +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { public: - explicit l2_sqrt_expanded_distances_t( - const distances_config_t &config) - : l2_expanded_distances_t(config) {} + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } - void compute(value_t *out_dists) override { + void compute(value_t* out_dists) override + { l2_expanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -259,79 +298,96 @@ class l2_sqrt_expanded_distances_t template class correlation_expanded_distances_t : public distances_t { public: - explicit correlation_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_corr(out_dists, search_coo_rows.data(), config_->a_data, - config_->a_nnz, b_indices, b_data, config_->b_nnz, - config_->a_nrows, config_->b_nrows, config_->b_ncols, + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, config_->handle.get_stream()); } ~correlation_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; /** - * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2))) - * The expanded form is more efficient for sparse data. + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * + * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data. */ template class cosine_expanded_distances_t : public distances_t { public: - explicit cosine_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t norms = sqrt(q_norm) * sqrt(r_norm); - // deal with potential for 0 in denominator by forcing 0/1 instead - value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); - - // flip the similarity when both rows are 0 - bool both_empty = (q_norm == 0) && (r_norm == 0); - return 1 - ((!both_empty * cos) + both_empty); - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = sqrt(q_norm) * sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); } ~cosine_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -348,25 +404,34 @@ class cosine_expanded_distances_t : public distances_t { template class hellinger_expanded_distances_t : public distances_t { public: - explicit hellinger_expanded_distances_t( - const distances_config_t &config) - : config_(&config), workspace(0, config.handle.get_stream()) {} + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative bool rectifier = (1 - input) > 0; @@ -378,42 +443,43 @@ class hellinger_expanded_distances_t : public distances_t { ~hellinger_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; }; template class russelrao_expanded_distances_t : public distances_t { public: - explicit russelrao_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_t n_cols = config_->a_ncols; + value_t n_cols = config_->a_ncols; value_t n_cols_inv = 1.0 / n_cols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, config_->handle.get_stream()); - auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); - auto diags = thrust::counting_iterator(0); + auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); + auto diags = thrust::counting_iterator(0); value_idx b_nrows = config_->b_nrows; - thrust::for_each(exec_policy, diags, diags + config_->a_nrows, - [=] __device__(value_idx input) { - out_dists[input * b_nrows + input] = 0.0; - }); + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); } ~russelrao_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index c11369375b..f5e7c75988 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -39,23 +39,33 @@ namespace sparse { namespace distance { namespace detail { -template -void unexpanded_lp_distances( - value_t *out_dists, const distances_config_t *config_, - product_f product_func, accum_f accum_func, write_f write_func) { +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - coo_rows.data(), config_->a_nnz, + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv_rev( @@ -72,48 +82,51 @@ void unexpanded_lp_distances( template class l1_unexpanded_distances_t : public distances_t { public: - l1_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Sum(), AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class l2_unexpanded_distances_t : public distances_t { public: - l2_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, SqDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, SqDiff(), Sum(), AtomicAdd()); } protected: - const distances_config_t *config_; + const distances_config_t* config_; }; template -class l2_sqrt_unexpanded_distances_t - : public l2_unexpanded_distances_t { +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { public: - l2_sqrt_unexpanded_distances_t( - const distances_config_t &config) - : l2_unexpanded_distances_t(config) {} + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { l2_unexpanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -125,29 +138,33 @@ class l2_sqrt_unexpanded_distances_t template class linf_unexpanded_distances_t : public distances_t { public: - explicit linf_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Max(), AtomicMax()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Max(), AtomicMax()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class canberra_unexpanded_distances_t : public distances_t { public: - explicit canberra_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { value_t d = fabs(a) + fabs(b); @@ -155,70 +172,82 @@ class canberra_unexpanded_distances_t : public distances_t { // forcing 1/0 instead return ((d != 0) * fabs(a - b)) / (d + (d == 0)); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class lp_unexpanded_distances_t : public distances_t { public: - explicit lp_unexpanded_distances_t( - const distances_config_t &config, value_t p_) - : config_(&config), p(p_) {} + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, PDiff(p), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, PDiff(p), Sum(), AtomicAdd()); float one_over_p = 1.0f / p; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return pow(input, one_over_p); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; value_t p; }; template class hamming_unexpanded_distances_t : public distances_t { public: - explicit hamming_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, NotEqual(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, NotEqual(), Sum(), AtomicAdd()); value_t n_cols = 1.0 / config_->a_ncols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return input * n_cols; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class jensen_shannon_unexpanded_distances_t : public distances_t { public: explicit jensen_shannon_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { - value_t m = 0.5f * (a + b); + value_t m = 0.5f * (a + b); bool a_zero = a == 0; bool b_zero = b == 0; @@ -228,49 +257,61 @@ class jensen_shannon_unexpanded_distances_t : public distances_t { bool x_zero = x == 0; bool y_zero = y == 0; - return (-a * (!x_zero * log(x + x_zero))) + - (-b * (!y_zero * log(y + y_zero))); + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return sqrt(0.5 * input); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class kl_divergence_unexpanded_distances_t : public distances_t { public: explicit kl_divergence_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return 0.5 * input; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; }; // END namespace detail diff --git a/cpp/include/raft/sparse/distance/detail/operators.cuh b/cpp/include/raft/sparse/distance/detail/operators.cuh index 9f206095bf..b2c2e2172b 100644 --- a/cpp/include/raft/sparse/distance/detail/operators.cuh +++ b/cpp/include/raft/sparse/distance/detail/operators.cuh @@ -25,21 +25,24 @@ namespace detail { struct Sum { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a + b; } }; struct NotEqual { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a != b; } }; struct SqDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return (a - b) * (a - b); } }; @@ -50,44 +53,48 @@ struct PDiff { PDiff(float p_) : p(p_) {} template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return pow(a - b, p); } }; struct Max { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fmax(a, b); } }; struct AtomicAdd { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicAdd(a, b); } }; struct AtomicMax { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicMax(a, b); } }; struct Product { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a * b; } }; struct AbsDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fabs(a - b); } }; diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index abfb7d24ea..8c01b33c1e 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -33,10 +33,10 @@ namespace detail { * @return the maximum number of columns that can be stored in smem */ template -inline int max_cols_per_block() { +inline int max_cols_per_block() +{ // max cols = (total smem available - cub reduction smem) - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(value_t); } diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index 0aeabe5019..92c08654d2 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -71,90 +71,71 @@ static const std::unordered_set supportedDistance{ * @param[out] out dense output array (size A.nrows * B.nrows) * @param[in] input_config input argument configuration * @param[in] metric distance metric to use -* @param[in] metric_arg metric argument (used for Minkowski distance) + * @param[in] metric_arg metric argument (used for Minkowski distance) */ template -void pairwiseDistance(value_t *out, +void pairwiseDistance(value_t* out, distances_config_t input_config, - raft::distance::DistanceType metric, float metric_arg) { + raft::distance::DistanceType metric, + float metric_arg) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: - detail::l2_expanded_distances_t(input_config) - .compute(out); + detail::l2_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtExpanded: - detail::l2_sqrt_expanded_distances_t(input_config) - .compute(out); + detail::l2_sqrt_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::InnerProduct: detail::ip_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2Unexpanded: - detail::l2_unexpanded_distances_t(input_config) - .compute(out); + detail::l2_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::l2_sqrt_unexpanded_distances_t(input_config) - .compute(out); + detail::l2_sqrt_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L1: - detail::l1_unexpanded_distances_t(input_config) - .compute(out); + detail::l1_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::LpUnexpanded: - detail::lp_unexpanded_distances_t(input_config, - metric_arg) - .compute(out); + detail::lp_unexpanded_distances_t(input_config, metric_arg).compute(out); break; case raft::distance::DistanceType::Linf: - detail::linf_unexpanded_distances_t(input_config) - .compute(out); + detail::linf_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::Canberra: - detail::canberra_unexpanded_distances_t(input_config) - .compute(out); + detail::canberra_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JaccardExpanded: - detail::jaccard_expanded_distances_t(input_config) - .compute(out); + detail::jaccard_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CosineExpanded: - detail::cosine_expanded_distances_t(input_config) - .compute(out); + detail::cosine_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HellingerExpanded: - detail::hellinger_expanded_distances_t(input_config) - .compute(out); + detail::hellinger_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::DiceExpanded: - detail::dice_expanded_distances_t(input_config) - .compute(out); + detail::dice_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CorrelationExpanded: - detail::correlation_expanded_distances_t(input_config) - .compute(out); + detail::correlation_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::RusselRaoExpanded: - detail::russelrao_expanded_distances_t(input_config) - .compute(out); + detail::russelrao_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HammingUnexpanded: - detail::hamming_unexpanded_distances_t(input_config) - .compute(out); + detail::hamming_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JensenShannon: - detail::jensen_shannon_unexpanded_distances_t( - input_config) - .compute(out); + detail::jensen_shannon_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::KLDivergence: - detail::kl_divergence_unexpanded_distances_t( - input_config) - .compute(out); + detail::kl_divergence_unexpanded_distances_t(input_config).compute(out); break; - default: - THROW("Unsupported distance: %d", metric); + default: THROW("Unsupported distance: %d", metric); } } diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h index 29f541498b..1738dd7498 100644 --- a/cpp/include/raft/sparse/hierarchy/common.h +++ b/cpp/include/raft/sparse/hierarchy/common.h @@ -37,13 +37,15 @@ class linkage_output { value_idx n_leaves; value_idx n_connected_components; - value_idx *labels; // size: m + value_idx* labels; // size: m - value_idx *children; // size: (m-1, 2) + value_idx* children; // size: (m-1, 2) }; -class linkage_output_int_float : public linkage_output {}; -class linkage_output__int64_float : public linkage_output {}; +class linkage_output_int_float : public linkage_output { +}; +class linkage_output__int64_float : public linkage_output { +}; }; // namespace hierarchy }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 4ef2ac43e2..207cca7287 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -44,31 +44,32 @@ class UnionFind { value_idx n_indices; UnionFind(value_idx N_) - : n_indices(2 * N_ - 1), - parent(2 * N_ - 1, -1), - size(2 * N_ - 1, 1), - next_label(N_) { + : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_) + { memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx)); } - value_idx find(value_idx n) { + value_idx find(value_idx n) + { value_idx p; p = n; - while (parent[n] != -1) n = parent[n]; + while (parent[n] != -1) + n = parent[n]; // path compression while (parent[p] != n) { - p = parent[p == -1 ? n_indices - 1 : p]; + p = parent[p == -1 ? n_indices - 1 : p]; parent[p == -1 ? n_indices - 1 : p] = n; } return n; } - void perform_union(value_idx m, value_idx n) { + void perform_union(value_idx m, value_idx n) + { size[next_label] = size[m] + size[n]; - parent[m] = next_label; - parent[n] = next_label; + parent[m] = next_label; + parent[n] = next_label; next_label += 1; } @@ -97,10 +98,15 @@ class UnionFind { * @param[out] out_size cluster sizes of output */ template -void build_dendrogram_host(const handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *data, - size_t nnz, value_idx *children, value_t *out_delta, - value_idx *out_size) { +void build_dendrogram_host(const handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* data, + size_t nnz, + value_idx* children, + value_t* out_delta, + value_idx* out_size) +{ auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -122,8 +128,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, UnionFind U(nnz + 1); for (std::size_t i = 0; i < nnz; i++) { - value_idx a = mst_src_h[i]; - value_idx b = mst_dst_h[i]; + value_idx a = mst_src_h[i]; + value_idx b = mst_dst_h[i]; value_t delta = mst_weights_h[i]; value_idx aa = U.find(a); @@ -131,10 +137,10 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, value_idx children_idx = i * 2; - children_h[children_idx] = aa; + children_h[children_idx] = aa; children_h[children_idx + 1] = bb; - out_delta_h[i] = delta; - out_size_h[i] = U.size[aa] + U.size[bb]; + out_delta_h[i] = delta; + out_size_h[i] = U.size[aa] + U.size[bb]; U.perform_union(aa, bb); } @@ -145,13 +151,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, } template -__global__ void write_levels_kernel(const value_idx *children, - value_idx *parents, value_idx n_vertices) { +__global__ void write_levels_kernel(const value_idx* children, + value_idx* parents, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { value_idx level = tid / 2; value_idx child = children[tid]; - parents[child] = level; + parents[child] = level; } } @@ -167,14 +175,17 @@ __global__ void write_levels_kernel(const value_idx *children, * @param labels */ template -__global__ void inherit_labels(const value_idx *children, - const value_idx *levels, std::size_t n_leaves, - value_idx *labels, int cut_level, - value_idx n_vertices) { +__global__ void inherit_labels(const value_idx* children, + const value_idx* levels, + std::size_t n_leaves, + value_idx* labels, + int cut_level, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { - value_idx node = children[tid]; + value_idx node = children[tid]; value_idx cur_level = tid / 2; /** @@ -184,12 +195,12 @@ __global__ void inherit_labels(const value_idx *children, if (cur_level > cut_level) return; value_idx cur_parent = node; - value_idx label = labels[cur_parent]; + value_idx label = labels[cur_parent]; while (label == -1) { cur_parent = cur_level + n_leaves; - cur_level = levels[cur_parent]; - label = labels[cur_parent]; + cur_level = levels[cur_parent]; + label = labels[cur_parent]; } labels[node] = label; @@ -198,15 +209,16 @@ __global__ void inherit_labels(const value_idx *children, template struct init_label_roots { - init_label_roots(value_idx *labels_) : labels(labels_) {} + init_label_roots(value_idx* labels_) : labels(labels_) {} template - __host__ __device__ void operator()(Tuple t) { + __host__ __device__ void operator()(Tuple t) + { labels[thrust::get<1>(t)] = thrust::get<0>(t); } private: - value_idx *labels; + value_idx* labels; }; /** @@ -222,10 +234,13 @@ struct init_label_roots { * @param n_leaves */ template -void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, size_t n_clusters, - size_t n_leaves) { - auto stream = handle.get_stream(); +void extract_flattened_clusters(const raft::handle_t& handle, + value_idx* labels, + const value_idx* children, + size_t n_clusters, + size_t n_leaves) +{ + auto stream = handle.get_stream(); auto thrust_policy = handle.get_thrust_policy(); // Handle special case where n_clusters == 1 @@ -243,24 +258,21 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, auto n_edges = (n_leaves - 1) * 2; - thrust::device_ptr d_ptr = - thrust::device_pointer_cast(children); - value_idx n_vertices = - *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; + thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); + value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; // Prevent potential infinite loop from labeling disconnected // connectivities graph. RAFT_EXPECTS(n_leaves > 0, "n_leaves must be positive"); - RAFT_EXPECTS(static_cast(n_vertices) == - static_cast((n_leaves - 1) * 2), - "Multiple components found in MST or MST is invalid. " - "Cannot find single-linkage solution."); + RAFT_EXPECTS( + static_cast(n_vertices) == static_cast((n_leaves - 1) * 2), + "Multiple components found in MST or MST is invalid. " + "Cannot find single-linkage solution."); rmm::device_uvector levels(n_vertices, stream); value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb); - write_levels_kernel<<>>(children, levels.data(), - n_vertices); + write_levels_kernel<<>>(children, levels.data(), n_vertices); /** * Step 1: Find label roots: * @@ -274,27 +286,26 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, rmm::device_uvector label_roots(child_size, stream); value_idx children_cpy_start = n_edges - child_size; - raft::copy_async(label_roots.data(), children + children_cpy_start, - child_size, stream); + raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream); - thrust::sort(thrust_policy, label_roots.data(), + thrust::sort(thrust_policy, + label_roots.data(), label_roots.data() + (child_size), thrust::greater()); rmm::device_uvector tmp_labels(n_vertices, stream); // Init labels to -1 - thrust::fill(thrust_policy, tmp_labels.data(), - tmp_labels.data() + n_vertices, -1); + thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1); // Write labels for cluster roots to "labels" thrust::counting_iterator first(0); - auto z_iter = thrust::make_zip_iterator(thrust::make_tuple( - first, label_roots.data() + (label_roots.size() - n_clusters))); + auto z_iter = thrust::make_zip_iterator( + thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters))); - thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters, - init_label_roots(tmp_labels.data())); + thrust::for_each( + thrust_policy, z_iter, z_iter + n_clusters, init_label_roots(tmp_labels.data())); /** * Step 2: Propagate labels by having children iterate through their parents @@ -304,9 +315,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, */ value_idx cut_level = (n_edges / 2) - (n_clusters - 1); - inherit_labels<<>>(children, levels.data(), - n_leaves, tmp_labels.data(), - cut_level, n_vertices); + inherit_labels<<>>( + children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices); // copy tmp labels to actual labels raft::copy_async(labels, tmp_labels.data(), n_leaves, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 31e4a0f263..c06c24e100 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -36,14 +36,17 @@ namespace raft { namespace hierarchy { namespace detail { -template +template struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c); }; /** @@ -52,37 +55,41 @@ struct distance_graph_impl { * @tparam value_t */ template -struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, +struct distance_graph_impl { + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { - auto stream = handle.get_stream(); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) + { + auto stream = handle.get_stream(); auto thrust_policy = handle.get_thrust_policy(); // Need to symmetrize knn into undirected graph raft::sparse::COO knn_graph_coo(stream); - raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, - c); + raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); indices.resize(knn_graph_coo.nnz, stream); data.resize(knn_graph_coo.nnz, stream); // self-loops get max distance - auto transform_in = thrust::make_zip_iterator(thrust::make_tuple( - knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); - - thrust::transform( - thrust_policy, transform_in, transform_in + knn_graph_coo.nnz, - knn_graph_coo.vals(), - [=] __device__(const thrust::tuple &tup) { - bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); - return (self_loop * std::numeric_limits::max()) + - (!self_loop * thrust::get<2>(tup)); - }); + auto transform_in = thrust::make_zip_iterator( + thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); + + thrust::transform(thrust_policy, + transform_in, + transform_in + knn_graph_coo.nnz, + knn_graph_coo.vals(), + [=] __device__(const thrust::tuple& tup) { + bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); + return (self_loop * std::numeric_limits::max()) + + (!self_loop * thrust::get<2>(tup)); + }); raft::sparse::convert::sorted_coo_to_csr( knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); @@ -90,10 +97,8 @@ struct distance_graph_impl -void get_distance_graph(const raft::handle_t &handle, const value_t *X, - size_t m, size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { +template +void get_distance_graph(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) +{ auto stream = handle.get_stream(); indptr.resize(m + 1, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 6ef6f9879b..0c0b049f11 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -34,9 +34,10 @@ namespace hierarchy { namespace detail { template -void merge_msts(raft::Graph_COO &coo1, - raft::Graph_COO &coo2, - cudaStream_t stream) { +void merge_msts(raft::Graph_COO& coo1, + raft::Graph_COO& coo2, + cudaStream_t stream) +{ /** Add edges to existing mst **/ int final_nnz = coo2.n_edges + coo1.n_edges; @@ -47,12 +48,9 @@ void merge_msts(raft::Graph_COO &coo1, /** * Construct final edge list */ - raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), - coo2.n_edges, stream); + raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream); + raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream); + raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream); coo1.n_edges = final_nnz; } @@ -71,12 +69,16 @@ void merge_msts(raft::Graph_COO &coo1, * @return updated MST edge list */ template -void connect_knn_graph(const raft::handle_t &handle, const value_t *X, - raft::Graph_COO &msf, - size_t m, size_t n, value_idx *color, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_knn_graph( + const raft::handle_t& handle, + const value_t* X, + raft::Graph_COO& msf, + size_t m, + size_t n, + value_idx* color, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto stream = handle.get_stream(); raft::sparse::COO connected_edges(stream); @@ -90,9 +92,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process - auto new_mst = raft::mst::mst( - handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m, - connected_edges.nnz, color, stream, false, false); + auto new_mst = raft::mst::mst(handle, + indptr2.data(), + connected_edges.cols(), + connected_edges.vals(), + m, + connected_edges.nnz, + color, + stream, + false, + false); merge_msts(msf, new_mst, stream); } @@ -122,28 +131,34 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, * argument is really just a safeguard against the potential for infinite loops. */ template -void build_sorted_mst(const raft::handle_t &handle, const value_t *X, - const value_idx *indptr, const value_idx *indices, - const value_t *pw_dists, size_t m, size_t n, - value_idx *mst_src, value_idx *mst_dst, - value_t *mst_weight, value_idx *color, size_t nnz, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded, - int max_iter = 10) { +void build_sorted_mst( + const raft::handle_t& handle, + const value_t* X, + const value_idx* indptr, + const value_idx* indices, + const value_t* pw_dists, + size_t m, + size_t n, + value_idx* mst_src, + value_idx* mst_dst, + value_t* mst_weight, + value_idx* color, + size_t nnz, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, + int max_iter = 10) +{ auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. auto mst_coo = raft::mst::mst( - handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, - true); + handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true); - int iters = 1; + int iters = 1; int n_components = linkage::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { - connect_knn_graph(handle, X, mst_coo, m, n, color, - reduction_op); + connect_knn_graph(handle, X, mst_coo, m, n, color, reduction_op); iters++; @@ -170,9 +185,8 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, " or increase 'max_iter'", max_iter); - raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(), - mst_coo.weights.data(), mst_coo.n_edges, - stream); + raft::sparse::op::coo_sort_by_weight( + mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream); raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream); raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream); diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 06fffb8aed..3b6f1347ab 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -44,18 +44,24 @@ static const size_t EMPTY = 0; * @param[in] n number of columns in X * @param[in] metric distance metrix to use when constructing connectivities graph * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control * of k. The algorithm will set `k = log(n) + c` * @param[in] n_clusters number of clusters to assign data samples */ -template -void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, - size_t n, raft::distance::DistanceType metric, - linkage_output *out, int c, - size_t n_clusters) { - ASSERT(n_clusters <= m, - "n_clusters must be less than or equal to the number of data points"); +void single_linkage(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output* out, + int c, + size_t n_clusters) +{ + ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points"); auto stream = handle.get_stream(); @@ -78,10 +84,20 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, */ rmm::device_uvector color(m, stream); raft::linkage::FixConnectivitiesRedOp op(color.data(), m); - detail::build_sorted_mst( - handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n, - mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(), - indices.size(), op, metric); + detail::build_sorted_mst(handle, + X, + indptr.data(), + indices.data(), + pw_dists.data(), + m, + n, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + color.data(), + indices.size(), + op, + metric); pw_dists.release(); @@ -93,15 +109,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, rmm::device_uvector out_delta(n_edges, stream); rmm::device_uvector out_size(n_edges, stream); // Create dendrogram - detail::build_dendrogram_host( - handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges, - out->children, out_delta.data(), out_size.data()); - detail::extract_flattened_clusters(handle, out->labels, out->children, - n_clusters, m); - - out->m = m; - out->n_clusters = n_clusters; - out->n_leaves = m; + detail::build_dendrogram_host(handle, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + n_edges, + out->children, + out_delta.data(), + out_size.data()); + detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m); + + out->m = m; + out->n_clusters = n_clusters; + out->n_leaves = m; out->n_connected_components = 1; } diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 7ed627b9e2..0c17d55762 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -40,40 +40,47 @@ namespace sparse { namespace linalg { template -__global__ void csr_add_calc_row_counts_kernel( - const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, - const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, - int *out_rowcounts) { +__global__ void csr_add_calc_row_counts_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_rowcounts) +{ // loop through columns in each set of rows and // calculate number of unique cols across both rows int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { int a_start_idx = a_ind[row]; - int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); + int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); /** - * Union of columns within each row of A and B so that we can scan through - * them, adding their values together. - */ + * Union of columns within each row of A and B so that we can scan through + * them, adding their values together. + */ int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx); - int *arr = new int[max_size]; + int* arr = new int[max_size]; int cur_arr_idx = 0; for (int j = a_start_idx; j < a_stop_idx; j++) { arr[cur_arr_idx] = a_indptr[j]; cur_arr_idx++; } - int arr_size = cur_arr_idx; + int arr_size = cur_arr_idx; int final_size = arr_size; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = 0; k < arr_size; k++) { if (arr[k] == cur_col) { found = true; @@ -81,9 +88,7 @@ __global__ void csr_add_calc_row_counts_kernel( } } - if (!found) { - final_size++; - } + if (!found) { final_size++; } } out_rowcounts[row] = final_size; @@ -94,11 +99,19 @@ __global__ void csr_add_calc_row_counts_kernel( } template -__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, - const T *a_val, int nnz1, const int *b_ind, - const int *b_indptr, const T *b_val, int nnz2, - int m, int *out_ind, int *out_indptr, - T *out_val) { +__global__ void csr_add_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, + int* out_indptr, + T* out_val) +{ // 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -109,21 +122,21 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); int o_idx = out_ind[row]; int cur_o_idx = o_idx; for (int j = a_start_idx; j < a_stop_idx; j++) { out_indptr[cur_o_idx] = a_indptr[j]; - out_val[cur_o_idx] = a_val[j]; + out_val[cur_o_idx] = a_val[j]; cur_o_idx++; } int arr_size = cur_o_idx - o_idx; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = o_idx; k < o_idx + arr_size; k++) { // If we found a match, sum the two values if (out_indptr[k] == cur_col) { @@ -136,7 +149,7 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, // if we didn't find a match, add the value for b if (!found) { out_indptr[o_idx + arr_size] = cur_col; - out_val[o_idx + arr_size] = b_val[j]; + out_val[o_idx + arr_size] = b_val[j]; arr_size++; } } @@ -159,31 +172,35 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param stream: cuda stream to use */ template -size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *out_ind, - cudaStream_t stream) { +size_t csr_add_calc_inds(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); rmm::device_uvector row_counts(m + 1, stream); - CUDA_CHECK( - cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); - csr_add_calc_row_counts_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, row_counts.data()); + csr_add_calc_row_counts_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data()); int cnnz = 0; raft::update_host(&cnnz, row_counts.data() + m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; } @@ -206,16 +223,25 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, * @param stream: cuda stream to use */ template -void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *c_ind, - int *c_indptr, T *c_val, cudaStream_t stream) { +void csr_add_finalize(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* c_ind, + int* c_indptr, + T* c_val, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_add_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, c_ind, c_indptr, c_val); + csr_add_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh index ef6a067c39..052f674325 100644 --- a/cpp/include/raft/sparse/linalg/degree.cuh +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -44,11 +44,10 @@ namespace linalg { * @param results array to place results */ template -__global__ void coo_degree_kernel(const T *rows, int nnz, T *results) { +__global__ void coo_degree_kernel(const T* rows, int nnz, T* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz) { - atomicAdd(results + rows[row], (T)1); - } + if (row < nnz) { atomicAdd(results + rows[row], (T)1); } } /** @@ -60,7 +59,8 @@ __global__ void coo_degree_kernel(const T *rows, int nnz, T *results) { * @param stream: cuda stream to use */ template -void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) { +void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -77,31 +77,28 @@ void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) { * @param stream: cuda stream to use */ template -void coo_degree(COO *in, int *results, cudaStream_t stream) { +void coo_degree(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_kernel - <<>>(in->rows(), in->nnz, results); + coo_degree_kernel<<>>(in->rows(), in->nnz, results); CUDA_CHECK(cudaGetLastError()); } template -__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz, - int *results) { +__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != 0.0) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); } } template -__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, - int nnz, T scalar, int *results) { +__global__ void coo_degree_scalar_kernel( + const int* rows, const T* vals, int nnz, T scalar, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != scalar) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); } } /** @@ -114,12 +111,12 @@ __global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, * @param stream: cuda stream to use */ template -void coo_degree_scalar(COO *in, T scalar, int *results, - cudaStream_t stream) { +void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_scalar_kernel<<>>( - in->rows(), in->vals(), in->nnz, scalar, results); + coo_degree_scalar_kernel + <<>>(in->rows(), in->vals(), in->nnz, scalar, results); CUDA_CHECK(cudaGetLastError()); } @@ -135,8 +132,9 @@ void coo_degree_scalar(COO *in, T scalar, int *results, * @param stream: cuda stream to use */ template -void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, - int *results, cudaStream_t stream = 0) { +void coo_degree_scalar( + const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); coo_degree_scalar_kernel @@ -154,12 +152,11 @@ void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, * @param stream: cuda stream to use */ template -void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, - cudaStream_t stream) { +void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_nz_kernel - <<>>(rows, vals, nnz, results); + coo_degree_nz_kernel<<>>(rows, vals, nnz, results); } /** @@ -171,7 +168,8 @@ void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_degree_nz(COO *in, int *results, cudaStream_t stream) { +void coo_degree_nz(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index bfcd3fd592..59dc5ff3e4 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -41,10 +41,12 @@ __global__ void csr_row_normalize_l1_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ex_scan (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T *result) { // output array + const int* ia, // csr row ex_scan (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -52,7 +54,7 @@ __global__ void csr_row_normalize_l1_kernel( // sum all vals_arr for row and divide each val by sum if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -65,7 +67,7 @@ __global__ void csr_row_normalize_l1_kernel( for (int j = start_idx; j < stop_idx; j++) { if (sum != 0.0) { - T val = vals[j]; + T val = vals[j]; result[j] = val / sum; } else { result[j] = 0.0; @@ -85,18 +87,18 @@ __global__ void csr_row_normalize_l1_kernel( * @param stream: cuda stream to use */ template -void csr_row_normalize_l1(const int *ia, // csr row ex_scan (sorted by row) - const T *vals, +void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num rows in csr - T *result, - cudaStream_t stream) { // output array + T* result, + cudaStream_t stream) +{ // output array dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_l1_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_l1_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } @@ -105,10 +107,12 @@ __global__ void csr_row_normalize_max_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ind array (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T *result) { // output array + const int* ia, // csr row ind array (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num total rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -116,7 +120,7 @@ __global__ void csr_row_normalize_max_kernel( // find max across columns and divide if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -130,7 +134,7 @@ __global__ void csr_row_normalize_max_kernel( // divide nonzeros in current row by max for (int j = start_idx; j < stop_idx; j++) { if (max != 0.0 && max > std::numeric_limits::min()) { - T val = vals[j]; + T val = vals[j]; result[j] = val / max; } else { result[j] = 0.0; @@ -151,16 +155,17 @@ __global__ void csr_row_normalize_max_kernel( */ template -void csr_row_normalize_max(const int *ia, // csr row ind array (sorted by row) - const T *vals, +void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num total rows in csr - T *result, cudaStream_t stream) { + T* result, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_max_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_max_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index ce0c4bbe6f..a293e359c2 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -30,15 +30,22 @@ namespace sparse { namespace spectral { template -void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, - int nnz, int n, int n_components, T *out, - unsigned long long seed = 1234567) { +void fit_embedding(const raft::handle_t& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ auto stream = handle.get_stream(); rmm::device_uvector src_offsets(n + 1, stream); rmm::device_uvector dst_cols(nnz, stream); rmm::device_uvector dst_vals(nnz, stream); - convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), - dst_cols.data(), dst_vals.data()); + convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); rmm::device_uvector eigVals(n_components + 1, stream); rmm::device_uvector eigVecs(n * (n_components + 1), stream); @@ -52,45 +59,49 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using index_type = int; using value_type = T; - index_type *ro = src_offsets.data(); - index_type *ci = dst_cols.data(); - value_type *vs = dst_vals.data(); + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); - raft::matrix::sparse_matrix_t const r_csr_m{ - handle, ro, ci, vs, n, nnz}; + raft::matrix::sparse_matrix_t const r_csr_m{handle, ro, ci, vs, n, nnz}; - index_type neigvs = n_components + 1; - index_type maxiter = 4000; //default reset value (when set to 0); - value_type tol = 0.01; - index_type restart_iter = 15 + neigvs; //what cugraph is using + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using - raft::eigen_solver_config_t cfg{neigvs, maxiter, - restart_iter, tol}; + raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; cfg.seed = seed; raft::lanczos_solver_t eig_solver{cfg}; - //cluster computation here is irrelevant, - //hence define a no-op such solver to - //feed partition(): + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): // struct no_op_cluster_solver_t { using index_type_t = index_type; - using size_type_t = index_type; + using size_type_t = index_type; using value_type_t = value_type; - std::pair solve( - handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim, - value_type_t const *__restrict__ obs, - index_type_t *__restrict__ codes) const { + std::pair solve(handle_t const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, r_csr_m, eig_solver, - no_op_cluster_solver_t{}, labels.data(), - eigVals.data(), eigVecs.data()); + raft::spectral::partition(handle, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); raft::copy(out, eigVecs.data() + n, n * n_components, stream); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index a6e1027288..ae89e7993c 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -47,26 +47,34 @@ namespace linalg { // TODO: value_idx param needs to be used for this once FAISS is updated to use float32 // for indices so that the index types can be uniform template -__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, - T *vals, int *orows, int *ocols, T *ovals, - int n, int cnnz, Lambda reduction_op) { +__global__ void coo_symmetrize_kernel(int* row_ind, + int* rows, + int* cols, + T* vals, + int* orows, + int* ocols, + T* ovals, + int n, + int cnnz, + Lambda reduction_op) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < n) { int start_idx = row_ind[row]; // each thread processes one row - int stop_idx = get_stop_idx(row, n, cnnz, row_ind); + int stop_idx = get_stop_idx(row, n, cnnz, row_ind); - int row_nnz = 0; + int row_nnz = 0; int out_start_idx = start_idx * 2; for (int idx = 0; idx < stop_idx - start_idx; idx++) { int cur_row = rows[idx + start_idx]; int cur_col = cols[idx + start_idx]; - T cur_val = vals[idx + start_idx]; + T cur_val = vals[idx + start_idx]; int lookup_row = cur_col; - int t_start = row_ind[lookup_row]; // Start at - int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); + int t_start = row_ind[lookup_row]; // Start at + int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); T transpose = 0.0; @@ -77,7 +85,7 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, // done in a different thread. if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) { // If it exists already, set transposed value to existing value - transpose = vals[t_idx]; + transpose = vals[t_idx]; found_match = true; break; } @@ -123,9 +131,11 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param stream: cuda stream to use */ template -void coo_symmetrize(COO *in, COO *out, +void coo_symmetrize(COO* in, + COO* out, Lambda reduction_op, // two-argument reducer - cudaStream_t stream) { + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -137,9 +147,16 @@ void coo_symmetrize(COO *in, COO *out, out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); - coo_symmetrize_kernel<<>>( - in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(), - out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op); + coo_symmetrize_kernel<<>>(in_row_ind.data(), + in->rows(), + in->cols(), + in->vals(), + out->rows(), + out->cols(), + out->vals(), + in->n_rows, + in->nnz, + reduction_op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -155,14 +172,15 @@ void coo_symmetrize(COO *in, COO *out, * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction */ template -__global__ static void symmetric_find_size(const value_t *restrict data, - const value_idx *restrict indices, - const value_idx n, const int k, - value_idx *restrict row_sizes, - value_idx *restrict row_sizes2) { +__global__ static void symmetric_find_size(const value_t* restrict data, + const value_idx* restrict indices, + const value_idx n, + const int k, + value_idx* restrict row_sizes, + value_idx* restrict row_sizes2) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; const auto col = indices[row * k + j]; @@ -182,9 +200,11 @@ __global__ static void symmetric_find_size(const value_t *restrict data, * @param row_sizes2: Input row sum 2 array(n) for faster reduction */ template -__global__ static void reduce_find_size(const value_idx n, const int k, - value_idx *restrict row_sizes, - const value_idx *restrict row_sizes2) { +__global__ static void reduce_find_size(const value_idx n, + const int k, + value_idx* restrict row_sizes, + const value_idx* restrict row_sizes2) +{ const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; if (i >= n) return; row_sizes[i] += (row_sizes2[i] + k); @@ -205,20 +225,21 @@ __global__ static void reduce_find_size(const value_idx n, const int k, * @param k: Number of n_neighbors */ template -__global__ static void symmetric_sum(value_idx *restrict edges, - const value_t *restrict data, - const value_idx *restrict indices, - value_t *restrict VAL, - value_idx *restrict COL, - value_idx *restrict ROW, const value_idx n, - const int k) { +__global__ static void symmetric_sum(value_idx* restrict edges, + const value_t* restrict data, + const value_idx* restrict indices, + value_t* restrict VAL, + value_idx* restrict COL, + value_idx* restrict ROW, + const value_idx n, + const int k) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; - const auto col = indices[row * k + j]; - const auto original = atomicAdd(&edges[row], value_idx(1)); + const auto col = indices[row * k + j]; + const auto original = atomicAdd(&edges[row], value_idx(1)); const auto transpose = atomicAdd(&edges[col], value_idx(1)); VAL[transpose] = VAL[original] = data[row * k + j]; @@ -247,27 +268,25 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param out: Output COO Matrix class * @param stream: Input cuda stream */ -template -void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, - const value_t *restrict knn_dists, - const value_idx n, const int k, - COO *out, - cudaStream_t stream) { +template +void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices, + const value_t* restrict knn_dists, + const value_idx n, + const int k, + COO* out, + cudaStream_t stream) +{ // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); - const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), - raft::ceildiv(k, TPB_Y)); + const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) rmm::device_uvector row_sizes(n, stream); - CUDA_CHECK( - cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); rmm::device_uvector row_sizes2(n, stream); - CUDA_CHECK( - cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); symmetric_find_size<<>>( knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data()); @@ -288,14 +307,12 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, // This mirrors CSR matrix's row Pointer, were maximum bounds for each row // are calculated as the cumulative rolling sum of the previous rows. // Notice reusing old row_sizes2 memory - value_idx *edges = row_sizes2.data(); - thrust::device_ptr __edges = thrust::device_pointer_cast(edges); - thrust::device_ptr __row_sizes = - thrust::device_pointer_cast(row_sizes.data()); + value_idx* edges = row_sizes2.data(); + thrust::device_ptr __edges = thrust::device_pointer_cast(edges); + thrust::device_ptr __row_sizes = thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, - __edges); + thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -307,9 +324,15 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, * Symmetrizes a COO matrix */ template -void symmetrize(const raft::handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *vals, size_t m, size_t n, - size_t nnz, raft::sparse::COO &out) { +void symmetrize(const raft::handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t m, + size_t n, + size_t nnz, + raft::sparse::COO& out) +{ auto stream = handle.get_stream(); // copy rows to cols and cols to rows @@ -326,13 +349,16 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream); // sort COO - raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, - symm_rows.data(), symm_cols.data(), - symm_vals.data(), stream); - - raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), - symm_cols.data(), symm_vals.data(), nnz * 2, - m, n); + raft::sparse::op::coo_sort((value_idx)m, + (value_idx)n, + (value_idx)nnz * 2, + symm_rows.data(), + symm_cols.data(), + symm_vals.data(), + stream); + + raft::sparse::op::max_duplicates( + handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 7ad4b93ec0..e3a9b1fbd9 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -55,27 +55,53 @@ namespace linalg { * @param[in] stream : Cuda stream for ordering events */ template -void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, - const value_idx *csr_indices, const value_t *csr_data, - value_idx *csc_indptr, value_idx *csc_indices, - value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, - value_idx nnz, cudaStream_t stream) { +void csr_transpose(cusparseHandle_t handle, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx* csc_indptr, + value_idx* csc_indices, + value_t* csc_data, + value_idx csr_nrows, + value_idx csr_ncols, + value_idx nnz, + cudaStream_t stream) +{ size_t convert_csc_workspace_size = 0; - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - &convert_csc_workspace_size, stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &convert_csc_workspace_size, + stream)); - rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, - stream); + rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, stream); - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - convert_csc_workspace.data(), stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + convert_csc_workspace.data(), + stream)); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh index f0d30b0cb7..36d426029b 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh @@ -28,10 +28,16 @@ namespace mst { namespace detail { template -__global__ void kernel_min_edge_per_vertex( - const edge_t* offsets, const vertex_t* indices, const alteration_t* weights, - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) { +__global__ void kernel_min_edge_per_vertex(const edge_t* offsets, + const vertex_t* indices, + const alteration_t* weights, + const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + const bool* mst_edge, + alteration_t* min_edge_color, + const vertex_t v) +{ edge_t tid = threadIdx.x + blockIdx.x * blockDim.x; unsigned warp_id = tid / 32; @@ -41,14 +47,14 @@ __global__ void kernel_min_edge_per_vertex( __shared__ alteration_t min_edge_weight[32]; __shared__ vertex_t min_color[32]; - min_edge_index[lane_id] = std::numeric_limits::max(); + min_edge_index[lane_id] = std::numeric_limits::max(); min_edge_weight[lane_id] = std::numeric_limits::max(); - min_color[lane_id] = std::numeric_limits::max(); + min_color[lane_id] = std::numeric_limits::max(); __syncthreads(); vertex_t self_color_idx = color_index[warp_id]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // find the minimum edge associated per row // each thread in warp holds the minimum edge for @@ -56,20 +62,20 @@ __global__ void kernel_min_edge_per_vertex( if (warp_id < v) { // one row is associated with one warp edge_t row_start = offsets[warp_id]; - edge_t row_end = offsets[warp_id + 1]; + edge_t row_end = offsets[warp_id + 1]; // assuming one warp per row // find min for each thread in warp for (edge_t e = row_start + lane_id; e < row_end; e += 32) { alteration_t curr_edge_weight = weights[e]; - vertex_t successor_color_idx = color_index[indices[e]]; - vertex_t successor_color = color[successor_color_idx]; + vertex_t successor_color_idx = color_index[indices[e]]; + vertex_t successor_color = color[successor_color_idx]; if (!mst_edge[e] && self_color != successor_color) { if (curr_edge_weight < min_edge_weight[lane_id]) { - min_color[lane_id] = successor_color; + min_color[lane_id] = successor_color; min_edge_weight[lane_id] = curr_edge_weight; - min_edge_index[lane_id] = e; + min_edge_index[lane_id] = e; } } } @@ -82,9 +88,9 @@ __global__ void kernel_min_edge_per_vertex( for (int offset = 16; offset > 0; offset >>= 1) { if (lane_id < offset) { if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) { - min_color[lane_id] = min_color[lane_id + offset]; + min_color[lane_id] = min_color[lane_id + offset]; min_edge_weight[lane_id] = min_edge_weight[lane_id + offset]; - min_edge_index[lane_id] = min_edge_index[lane_id + offset]; + min_edge_index[lane_id] = min_edge_index[lane_id + offset]; } } __syncthreads(); @@ -102,19 +108,26 @@ __global__ void kernel_min_edge_per_vertex( } } -template -__global__ void min_edge_per_supervertex( - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - bool* mst_edge, const vertex_t* indices, const weight_t* weights, - const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst, - weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v, - bool symmetrize_output) { +template +__global__ void min_edge_per_supervertex(const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + bool* mst_edge, + const vertex_t* indices, + const weight_t* weights, + const alteration_t* altered_weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const alteration_t* min_edge_color, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { vertex_t vertex_color_idx = color_index[tid]; - vertex_t vertex_color = color[vertex_color_idx]; - edge_t edge_idx = new_mst_edge[tid]; + vertex_t vertex_color = color[vertex_color_idx]; + edge_t edge_idx = new_mst_edge[tid]; // check if valid outgoing edge was found // find minimum edge is same as minimum edge of whole supervertex @@ -129,32 +142,27 @@ __global__ void min_edge_per_supervertex( auto dst = indices[edge_idx]; if (!symmetrize_output) { auto dst_edge_idx = new_mst_edge[dst]; - auto dst_color = color[color_index[dst]]; + auto dst_color = color[color_index[dst]]; // vertices added each other // only if destination has found an edge // the edge points back to source // the edge is minimum edge found for dst color - if (dst_edge_idx != std::numeric_limits::max() && - indices[dst_edge_idx] == tid && + if (dst_edge_idx != std::numeric_limits::max() && indices[dst_edge_idx] == tid && min_edge_color[dst_color] == altered_weights[dst_edge_idx]) { - if (vertex_color > dst_color) { - add_edge = false; - } + if (vertex_color > dst_color) { add_edge = false; } } } if (add_edge) { - temp_src[tid] = tid; - temp_dst[tid] = dst; - temp_weights[tid] = weights[edge_idx]; + temp_src[tid] = tid; + temp_dst[tid] = dst; + temp_weights[tid] = weights[edge_idx]; mst_edge[edge_idx] = true; } } - if (!add_edge) { - new_mst_edge[tid] = std::numeric_limits::max(); - } + if (!add_edge) { new_mst_edge[tid] = std::numeric_limits::max(); } } } } @@ -162,9 +170,13 @@ __global__ void min_edge_per_supervertex( template __global__ void add_reverse_edge(const edge_t* new_mst_edge, const vertex_t* indices, - const weight_t* weights, vertex_t* temp_src, - vertex_t* temp_dst, weight_t* temp_weights, - const vertex_t v, bool symmetrize_output) { + const weight_t* weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { @@ -186,9 +198,7 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // if vertices did not pick each other // add a reverse edge - if (tid != neighbor_vertex_neighbor) { - reverse_needed = true; - } + if (tid != neighbor_vertex_neighbor) { reverse_needed = true; } } } @@ -197,8 +207,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // it is assumed the each vertex only picks one valid min edge // per cycle // hence, we store at index tid + v for the reverse edge scenario - temp_src[tid + v] = neighbor_vertex; - temp_dst[tid + v] = tid; + temp_src[tid + v] = neighbor_vertex; + temp_dst[tid + v] = tid; temp_weights[tid + v] = weights[edge_idx]; } } @@ -207,11 +217,13 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // executes for newly added mst edges and updates the colors of both vertices to the lower color template -__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, +__global__ void min_pair_colors(const vertex_t v, + const vertex_t* indices, const edge_t* new_mst_edge, const vertex_t* color, const vertex_t* color_index, - vertex_t* next_color) { + vertex_t* next_color) +{ auto i = get_1D_idx(); if (i < v) { @@ -220,9 +232,9 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, if (edge_idx != std::numeric_limits::max()) { vertex_t neighbor_vertex = indices[edge_idx]; // vertex_t self_color = color[i]; - vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; - vertex_t neighbor_color_idx = color_index[neighbor_vertex]; + vertex_t self_color_idx = color_index[i]; + vertex_t self_color = color[self_color_idx]; + vertex_t neighbor_color_idx = color_index[neighbor_vertex]; vertex_t neighbor_super_color = color[neighbor_color_idx]; // update my own color as source of edge @@ -238,33 +250,36 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, // for each vertex, update color if it was changed in min_pair_colors kernel template -__global__ void update_colors(const vertex_t v, vertex_t* color, +__global__ void update_colors(const vertex_t v, + vertex_t* color, const vertex_t* color_index, - const vertex_t* next_color, bool* done) { + const vertex_t* next_color, + bool* done) +{ auto i = get_1D_idx(); if (i < v) { - vertex_t self_color = color[i]; + vertex_t self_color = color[i]; vertex_t self_color_idx = color_index[i]; - vertex_t new_color = next_color[self_color_idx]; + vertex_t new_color = next_color[self_color_idx]; // update self color to new smaller color if (self_color > new_color) { color[i] = new_color; - *done = false; + *done = false; } } } // point vertices to their final color index template -__global__ void final_color_indices(const vertex_t v, const vertex_t* color, - vertex_t* color_index) { +__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index) +{ auto i = get_1D_idx(); if (i < v) { vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // if self color is not equal to self color index, // it means self is not supervertex @@ -272,7 +287,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // parent supervertex while (self_color_idx != self_color) { self_color_idx = color_index[self_color]; - self_color = color[self_color_idx]; + self_color = color[self_color_idx]; } // point to new supervertex @@ -282,22 +297,23 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu // Consider using curand device API instead of precomputed random_values array -template -__global__ void alteration_kernel(const vertex_t v, const edge_t e, +template +__global__ void alteration_kernel(const vertex_t v, + const edge_t e, const edge_t* offsets, const vertex_t* indices, - const weight_t* weights, alteration_t max, + const weight_t* weights, + alteration_t max, alteration_t* random_values, - alteration_t* altered_weights) { + alteration_t* altered_weights) +{ auto row = get_1D_idx(); if (row < v) { auto row_begin = offsets[row]; - auto row_end = offsets[row + 1]; + auto row_end = offsets[row + 1]; for (auto i = row_begin; i < row_end; i++) { - auto column = indices[i]; - altered_weights[i] = - weights[i] + max * (random_values[row] + random_values[column]); + auto column = indices[i]; + altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]); } } } @@ -305,17 +321,15 @@ __global__ void alteration_kernel(const vertex_t v, const edge_t e, template __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src, edge_t* mst_edge_count, - const vertex_t v) { + const vertex_t v) +{ auto tid = get_1D_idx(); // count number of new mst edges added - bool predicate = - tid < v && (mst_src[tid] != std::numeric_limits::max()); + bool predicate = tid < v && (mst_src[tid] != std::numeric_limits::max()); vertex_t block_count = __syncthreads_count(predicate); - if (threadIdx.x == 0 && block_count > 0) { - atomicAdd(mst_edge_count, block_count); - } + if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); } } } // namespace detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index 33b980afcd..5591e15b19 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -40,21 +40,30 @@ typedef std::chrono::high_resolution_clock Clock; // curand generator uniform inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - float* outputPtr, size_t n) { + float* outputPtr, + size_t n) +{ return curandGenerateUniform(generator, outputPtr, n); } inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - double* outputPtr, size_t n) { + double* outputPtr, + size_t n) +{ return curandGenerateUniformDouble(generator, outputPtr, n); } -template -MST_solver::MST_solver( - const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, const vertex_t v_, - const edge_t e_, vertex_t* color_, cudaStream_t stream_, - bool symmetrize_output_, bool initialize_colors_, int iterations_) +template +MST_solver::MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_) : handle(handle_), offsets(offsets_), indices(indices_), @@ -76,17 +85,17 @@ MST_solver::MST_solver( stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), - iterations(iterations_) { - max_blocks = handle_.get_device_properties().maxGridSize[0]; + iterations(iterations_) +{ + max_blocks = handle_.get_device_properties().maxGridSize[0]; max_threads = handle_.get_device_properties().maxThreadsPerBlock; - sm_count = handle_.get_device_properties().multiProcessorCount; + sm_count = handle_.get_device_properties().multiProcessorCount; mst_edge_count.set_value_to_zero_async(stream); prev_mst_edge_count.set_value_to_zero_async(stream); - CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), - stream)); + CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), stream)); - //Initially, color holds the vertex id as color + // Initially, color holds the vertex id as color auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); @@ -97,10 +106,10 @@ MST_solver::MST_solver( thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } -template +template raft::Graph_COO -MST_solver::solve() { +MST_solver::solve() +{ RAFT_EXPECTS(v > 0, "0 vertices"); RAFT_EXPECTS(e > 0, "0 edges"); RAFT_EXPECTS(offsets != nullptr, "Null offsets."); @@ -113,12 +122,13 @@ MST_solver::solve() { // Alterating the weights // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta. - // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight. + // For each edge, add noise that is less than theta. That is, generate a random number in the + // range [0.0, theta) and add it to each edge weight. alteration(); #ifdef MST_TIME auto stop = Clock::now(); - timer0 = duration_us(stop - start); + timer0 = duration_us(stop - start); #endif auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1; @@ -167,8 +177,8 @@ MST_solver::solve() { if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; - std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 - << "," << timer4 << "," << timer5 << std::endl; + std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << "," + << timer5 << std::endl; #endif // exit here when reaching steady state break; @@ -178,8 +188,7 @@ MST_solver::solve() { start = Clock::now(); #endif // append the newly found MST edges to the final output - append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), - mst_result.weights.data()); + append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data()); #ifdef MST_TIME stop = Clock::now(); timer4 += duration_us(stop - start); @@ -210,50 +219,46 @@ MST_solver::solve() { // ||y|-|x|| template struct alteration_functor { - __host__ __device__ weight_t - operator()(const thrust::tuple& t) { + __host__ __device__ weight_t operator()(const thrust::tuple& t) + { auto x = thrust::get<0>(t); auto y = thrust::get<1>(t); - x = x < 0 ? -x : x; - y = y < 0 ? -y : y; + x = x < 0 ? -x : x; + y = y < 0 ? -y : y; return x < y ? y - x : x - y; } }; // Compute the uper bound for the alteration -template -alteration_t -MST_solver::alteration_max() { +template +alteration_t MST_solver::alteration_max() +{ auto policy = handle.get_thrust_policy(); rmm::device_uvector tmp(e, stream); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); - //sort tmp weights + // sort tmp weights thrust::sort(policy, tmp.begin(), tmp.end()); - //remove duplicates + // remove duplicates auto new_end = thrust::unique(policy, tmp.begin(), tmp.end()); - //min(a[i+1]-a[i])/2 - auto begin = - thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); - auto end = - thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp.element(1, stream) - tmp.element(0, stream); - auto max = - thrust::transform_reduce(policy, begin, end, alteration_functor(), - init, thrust::minimum()); + // min(a[i+1]-a[i])/2 + auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); + auto init = tmp.element(1, stream) - tmp.element(0, stream); + auto max = thrust::transform_reduce( + policy, begin, end, alteration_functor(), init, thrust::minimum()); return max / static_cast(2); } // Compute the alteration to make all undirected edge weight unique // Preserves weights order -template -void MST_solver::alteration() { +template +void MST_solver::alteration() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); // maximum alteration that does not change realtive weights order alteration_t max = alteration_max(); @@ -270,34 +275,32 @@ void MST_solver::alteration() { auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); - RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, - "MST: CURAND cleanup failed"); + RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed"); - //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu + // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data(), - altered_weights.data()); + v, e, offsets, indices, weights, max, rand_values.data(), altered_weights.data()); } // updates colors of vertices by propagating the lower color to the higher -template -void MST_solver::label_prop( - vertex_t* mst_src, vertex_t* mst_dst) { +template +void MST_solver::label_prop(vertex_t* mst_src, + vertex_t* mst_dst) +{ // update the colors of both ends its until there is no change in colors edge_t curr_mst_edge_count = mst_edge_count.value(stream); auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); - auto min_pair_nblocks = std::min( - (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); + auto min_pair_nblocks = + std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); edge_t* new_mst_edge_ptr = new_mst_edge.data(); - vertex_t* color_ptr = color.data(); + vertex_t* color_ptr = color.data(); vertex_t* next_color_ptr = next_color.data(); rmm::device_scalar done(stream); done.set_value_to_zero_async(stream); - bool* done_ptr = done.data(); + bool* done_ptr = done.data(); const bool true_val = true; auto i = 0; @@ -312,84 +315,99 @@ void MST_solver::label_prop( i++; } - detail:: - final_color_indices<<>>( - v, color_ptr, color_index); + detail::final_color_indices<<>>( + v, color_ptr, color_index); #ifdef MST_TIME std::cout << "Label prop iterations: " << i << std::endl; #endif } // Finds the minimum edge from each vertex to the lowest color -template -void MST_solver::min_edge_per_vertex() { +template +void MST_solver::min_edge_per_vertex() +{ auto policy = handle.get_thrust_policy(); - thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), - std::numeric_limits::max()); - thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), - std::numeric_limits::max()); + thrust::fill( + policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); + thrust::fill( + policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits::max()); int n_threads = 32; - vertex_t* color_ptr = color.data(); - edge_t* new_mst_edge_ptr = new_mst_edge.data(); - bool* mst_edge_ptr = mst_edge.data(); - alteration_t* min_edge_color_ptr = min_edge_color.data(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); alteration_t* altered_weights_ptr = altered_weights.data(); - detail::kernel_min_edge_per_vertex<<>>( - offsets, indices, altered_weights_ptr, color_ptr, color_index, - new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v); + detail::kernel_min_edge_per_vertex<<>>(offsets, + indices, + altered_weights_ptr, + color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + min_edge_color_ptr, + v); } // Finds the minimum edge from each supervertex to the lowest color -template -void MST_solver::min_edge_per_supervertex() { +template +void MST_solver::min_edge_per_supervertex() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); auto policy = handle.get_thrust_policy(); - thrust::fill(policy, temp_src.begin(), temp_src.end(), - std::numeric_limits::max()); + thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits::max()); - vertex_t* color_ptr = color.data(); - edge_t* new_mst_edge_ptr = new_mst_edge.data(); - bool* mst_edge_ptr = mst_edge.data(); - alteration_t* min_edge_color_ptr = min_edge_color.data(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); alteration_t* altered_weights_ptr = altered_weights.data(); - vertex_t* temp_src_ptr = temp_src.data(); - vertex_t* temp_dst_ptr = temp_dst.data(); - weight_t* temp_weights_ptr = temp_weights.data(); - - detail::min_edge_per_supervertex<<>>( - color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, - altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr, - min_edge_color_ptr, v, symmetrize_output); + vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_dst_ptr = temp_dst.data(); + weight_t* temp_weights_ptr = temp_weights.data(); + + detail::min_edge_per_supervertex<<>>(color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + indices, + weights, + altered_weights_ptr, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + min_edge_color_ptr, + v, + symmetrize_output); // the above kernel only adds directed mst edges in the case where // a pair of vertices don't pick the same min edge between them // so, now we add the reverse edge to make it undirected if (symmetrize_output) { - detail::add_reverse_edge<<>>( - new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr, - temp_weights_ptr, v, symmetrize_output); + detail::add_reverse_edge<<>>(new_mst_edge_ptr, + indices, + weights, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + v, + symmetrize_output); } } -template -void MST_solver::check_termination() { +template +void MST_solver::check_termination() +{ vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads); - vertex_t nblocks = - std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); + vertex_t nblocks = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges edge_t* mst_edge_count_ptr = mst_edge_count.data(); - vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_src_ptr = temp_src.data(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -397,36 +415,40 @@ void MST_solver::check_termination() { template struct new_edges_functor { - __host__ __device__ bool operator()( - const thrust::tuple& t) { + __host__ __device__ bool operator()(const thrust::tuple& t) + { auto src = thrust::get<0>(t); return src != std::numeric_limits::max() ? true : false; } }; -template +template void MST_solver::append_src_dst_pair( - vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { + vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) +{ auto policy = handle.get_thrust_policy(); edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream); // iterator to end of mst edges added to final output in previous iteration - auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( - mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count, - mst_weights + curr_mst_edge_count)); + auto src_dst_zip_end = + thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count, + mst_dst + curr_mst_edge_count, + mst_weights + curr_mst_edge_count)); // iterator to new mst edges found - auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple( - temp_src.begin(), temp_dst.begin(), temp_weights.begin())); + auto temp_src_dst_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin())); auto temp_src_dst_zip_end = thrust::make_zip_iterator( thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end())); // copy new mst edges to final output - thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end, - src_dst_zip_end, new_edges_functor()); + thrust::copy_if(policy, + temp_src_dst_zip_begin, + temp_src_dst_zip_end, + src_dst_zip_end, + new_edges_functor()); } } // namespace mst diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 4d5ca6ebe1..97a76e1d50 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -26,32 +26,29 @@ namespace mst { namespace detail { template -__device__ idx_t get_1D_idx() { +__device__ idx_t get_1D_idx() +{ return blockIdx.x * blockDim.x + threadIdx.x; } // somewhat smart vector print template -void printv(rmm::device_uvector& vec, const std::string& name = "", - const size_t displ = 5) { +void printv(rmm::device_uvector& vec, const std::string& name = "", const size_t displ = 5) +{ #ifdef MST_TIME std::cout.precision(15); std::cout << name << " size = " << vec.size() << std::endl; if (displ < vec.size()) { - thrust::copy(vec.begin(), vec.begin() + displ, - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator(std::cout, " ")); std::cout << " ... "; - thrust::copy(vec.end() - displ, vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator(std::cout, " ")); } else { - thrust::copy(vec.begin(), vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.end(), std::ostream_iterator(std::cout, " ")); } std::cout << std::endl << std::endl; #endif } -#define duration_us(a) \ - std::chrono::duration_cast(a).count() +#define duration_us(a) std::chrono::duration_cast(a).count() } // namespace detail } // namespace mst diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh index 10c981445e..b49003467b 100644 --- a/cpp/include/raft/sparse/mst/mst.cuh +++ b/cpp/include/raft/sparse/mst/mst.cuh @@ -22,16 +22,30 @@ namespace raft { namespace mst { -template -raft::Graph_COO mst( - const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices, - weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color, - cudaStream_t stream, bool symmetrize_output = true, - bool initialize_colors = true, int iterations = 0) { - MST_solver mst_solver( - handle, offsets, indices, weights, v, e, color, stream, symmetrize_output, - initialize_colors, iterations); +template +raft::Graph_COO mst(const raft::handle_t& handle, + edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights, + vertex_t const v, + edge_t const e, + vertex_t* color, + cudaStream_t stream, + bool symmetrize_output = true, + bool initialize_colors = true, + int iterations = 0) +{ + MST_solver mst_solver(handle, + offsets, + indices, + weights, + v, + e, + color, + stream, + symmetrize_output, + initialize_colors, + iterations); return mst_solver.solve(); } diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index 44b34ee5c7..bae5d77d8e 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -31,20 +31,27 @@ struct Graph_COO { edge_t n_edges; Graph_COO(vertex_t size, cudaStream_t stream) - : src(size, stream), dst(size, stream), weights(size, stream) {} + : src(size, stream), dst(size, stream), weights(size, stream) + { + } }; namespace mst { -template +template class MST_solver { public: - MST_solver(const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, - const vertex_t v_, const edge_t e_, vertex_t* color_, - cudaStream_t stream_, bool symmetrize_output_, - bool initialize_colors_, int iterations_); + MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_); raft::Graph_COO solve(); @@ -56,7 +63,7 @@ class MST_solver { bool symmetrize_output, initialize_colors; int iterations; - //CSR + // CSR const edge_t* offsets; const vertex_t* indices; const weight_t* weights; @@ -67,20 +74,16 @@ class MST_solver { vertex_t max_threads; vertex_t sm_count; - vertex_t* color_index; // represent each supervertex as a color - rmm::device_uvector - min_edge_color; // minimum incident edge weight per color - rmm::device_uvector new_mst_edge; // new minimum edge per vertex - rmm::device_uvector - altered_weights; // weights to be used for mst + vertex_t* color_index; // represent each supervertex as a color + rmm::device_uvector min_edge_color; // minimum incident edge weight per color + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration rmm::device_scalar - mst_edge_count; // total number of edges added after every iteration - rmm::device_scalar - prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_uvector - mst_edge; // mst output - true if the edge belongs in mst + prev_mst_edge_count; // total number of edges up to the previous iteration + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst rmm::device_uvector next_color; // next iteration color - rmm::device_uvector color; // index of color that vertex points to + rmm::device_uvector color; // index of color that vertex points to // new src-dst pairs found per iteration rmm::device_uvector temp_src; @@ -93,8 +96,7 @@ class MST_solver { void check_termination(); void alteration(); alteration_t alteration_max(); - void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, - weight_t* mst_weights); + void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights); }; } // namespace mst diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 492058f85f..8bc8c746f9 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -42,15 +42,23 @@ namespace sparse { namespace op { template -__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, - const T *vals, int nnz, int *crows, - int *ccols, T *cvals, int *ex_scan, - int *cur_ex_scan, int m, T scalar) { +__global__ void coo_remove_scalar_kernel(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* ex_scan, + int* cur_ex_scan, + int m, + T scalar) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { - int start = cur_ex_scan[row]; - int stop = get_stop_idx(row, m, nnz, cur_ex_scan); + int start = cur_ex_scan[row]; + int stop = get_stop_idx(row, m, nnz, cur_ex_scan); int cur_out_idx = ex_scan[row]; for (int idx = start; idx < stop; idx++) { @@ -82,35 +90,49 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, * @param stream: cuda stream to use */ template -void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, cudaStream_t stream) { +void coo_remove_scalar(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* cnnz, + int* cur_cnnz, + T scalar, + int n, + cudaStream_t stream) +{ rmm::device_uvector ex_scan(n, stream); rmm::device_uvector cur_ex_scan(n, stream); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); - thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); - thrust::device_ptr dev_ex_scan = - thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, - dev_ex_scan); + thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); + thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); - thrust::device_ptr dev_cur_ex_scan = - thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, - dev_cur_cnnz + n, dev_cur_ex_scan); + thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); + thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - coo_remove_scalar_kernel<<>>( - rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(), - dev_cur_ex_scan.get(), n, scalar); + coo_remove_scalar_kernel<<>>(rows, + cols, + vals, + nnz, + crows, + ccols, + cvals, + dev_ex_scan.get(), + dev_cur_ex_scan.get(), + n, + scalar); CUDA_CHECK(cudaPeekAtLastError()); } @@ -123,33 +145,39 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { +void coo_remove_scalar(COO* in, COO* out, T scalar, cudaStream_t stream) +{ rmm::device_uvector row_count_nz(in->n_rows, stream); rmm::device_uvector row_count(in->n_rows, stream); - CUDA_CHECK( - cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, - row_count_nz.data(), stream); + linalg::coo_degree_scalar( + in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr d_row_count_nz = - thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, - d_row_count_nz + in->n_rows); + thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); + int out_nnz = + thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); - coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, - out->rows(), out->cols(), out->vals(), - row_count_nz.data(), row_count.data(), scalar, - in->n_rows, stream); + coo_remove_scalar(in->rows(), + in->cols(), + in->vals(), + in->nnz, + out->rows(), + out->cols(), + out->vals(), + row_count_nz.data(), + row_count.data(), + scalar, + in->n_rows, + stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -161,7 +189,8 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, cudaStream_t stream) { +void coo_remove_zeros(COO* in, COO* out, cudaStream_t stream) +{ coo_remove_scalar(in, out, T(0.0), stream); } diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 09a35720fb..84d584d108 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -44,25 +44,29 @@ namespace sparse { namespace op { template -__global__ void compute_duplicates_diffs_kernel(const value_idx *rows, - const value_idx *cols, - value_idx *diff, size_t nnz) { +__global__ void compute_duplicates_diffs_kernel(const value_idx* rows, + const value_idx* cols, + value_idx* diff, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; value_idx d = 1; - if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) - d = 0; + if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0; diff[tid] = d; } template -__global__ void max_duplicates_kernel(const value_idx *src_rows, - const value_idx *src_cols, - const value_t *src_vals, - const value_idx *index, - value_idx *out_rows, value_idx *out_cols, - value_t *out_vals, size_t nnz) { +__global__ void max_duplicates_kernel(const value_idx* src_rows, + const value_idx* src_cols, + const value_t* src_vals, + const value_idx* index, + value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < nnz) { @@ -94,13 +98,13 @@ __global__ void max_duplicates_kernel(const value_idx *src_rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void compute_duplicates_mask(value_idx *mask, const value_idx *rows, - const value_idx *cols, size_t nnz, - cudaStream_t stream) { +void compute_duplicates_mask( + value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream) +{ CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream)); - compute_duplicates_diffs_kernel<<>>(rows, cols, mask, nnz); + compute_duplicates_diffs_kernel<<>>( + rows, cols, mask, nnz); } /** @@ -120,11 +124,16 @@ void compute_duplicates_mask(value_idx *mask, const value_idx *rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void max_duplicates(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_idx *rows, const value_idx *cols, - const value_t *vals, size_t nnz, size_t m, size_t n) { - auto stream = handle.get_stream(); +void max_duplicates(const raft::handle_t& handle, + raft::sparse::COO& out, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t nnz, + size_t m, + size_t n) +{ + auto stream = handle.get_stream(); auto thrust_policy = handle.get_thrust_policy(); // compute diffs & take exclusive scan @@ -132,8 +141,7 @@ void max_duplicates(const raft::handle_t &handle, compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), - diff.data()); + thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh index 9e5034dc28..194a878ac1 100644 --- a/cpp/include/raft/sparse/op/row_op.cuh +++ b/cpp/include/raft/sparse/op/row_op.cuh @@ -38,12 +38,12 @@ namespace sparse { namespace op { template void> -__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, - Lambda op) { +__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op) +{ T row = blockIdx.x * TPB_X + threadIdx.x; if (row < n_rows) { T start_idx = row_ind[row]; - T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; + T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; op(row, start_idx, stop_idx); } } @@ -59,14 +59,12 @@ __global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, * @param op custom row operation functor accepting the row and beginning index. * @param stream cuda stream to use */ -template void> -void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op, - cudaStream_t stream) { +template void> +void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream) +{ dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_op_kernel - <<>>(row_ind, n_rows, nnz, op); + csr_row_op_kernel<<>>(row_ind, n_rows, nnz, op); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h index 46f4f41879..9bbe04cf34 100644 --- a/cpp/include/raft/sparse/op/slice.h +++ b/cpp/include/raft/sparse/op/slice.h @@ -50,10 +50,14 @@ namespace op { * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, - const value_idx *indptr, value_idx *indptr_out, - value_idx *start_offset, value_idx *stop_offset, - cudaStream_t stream) { +void csr_row_slice_indptr(value_idx start_row, + value_idx stop_row, + const value_idx* indptr, + value_idx* indptr_out, + value_idx* start_offset, + value_idx* stop_offset, + cudaStream_t stream) +{ raft::update_host(start_offset, indptr + start_row, 1, stream); raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream); @@ -63,11 +67,12 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1, // we add another 1 to stop row. - raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, - stream); + raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream); raft::linalg::unaryOp( - indptr_out, indptr_out, (stop_row + 2) - start_row, + indptr_out, + indptr_out, + (stop_row + 2) - start_row, [s_offset] __device__(value_idx input) { return input - s_offset; }, stream); } @@ -85,12 +90,15 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset, - const value_idx *indices, const value_t *data, - value_idx *indices_out, value_t *data_out, - cudaStream_t stream) { - raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, - stream); +void csr_row_slice_populate(value_idx start_offset, + value_idx stop_offset, + const value_idx* indices, + const value_t* data, + value_idx* indices_out, + value_t* data_out, + cudaStream_t stream) +{ + raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream); raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream); } diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index c40801a0b1..d397bce780 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -38,7 +38,8 @@ namespace op { struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -61,13 +62,12 @@ struct TupleComp { * @param stream: cuda stream to use */ template -void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - cudaStream_t stream) { +void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream) +{ auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, - vals, TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, vals, TupleComp()); } /** @@ -77,9 +77,9 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, cudaStream_t stream) { - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), stream); +void coo_sort(COO* const in, cudaStream_t stream) +{ + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); } /** @@ -93,8 +93,9 @@ void coo_sort(COO *const in, cudaStream_t stream) { * @param[in] stream cuda stream for which to order cuda operations */ template -void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, - value_idx nnz, cudaStream_t stream) { +void coo_sort_by_weight( + value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) +{ thrust::device_ptr t_data = thrust::device_pointer_cast(data); auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 5313b81192..8edb0e8b43 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -59,17 +59,20 @@ struct KeyValuePair { __host__ __device__ __forceinline__ KeyValuePair() {} /// Copy Constructor - __host__ __device__ __forceinline__ - KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) + : key(kvp.key), value(kvp.value) + { + } /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const &key, - Value const &value) - : key(key), value(value) {} + __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) + : key(key), value(value) + { + } /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) { + __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) + { return (value != b.value) || (key != b.key); } }; @@ -83,31 +86,32 @@ struct KeyValuePair { */ template struct FixConnectivitiesRedOp { - value_idx *colors; + value_idx* colors; value_idx m; - FixConnectivitiesRedOp(value_idx *colors_, value_idx m_) - : colors(colors_), m(m_){}; + FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){}; typedef typename cub::KeyValuePair KVP; - DI void operator()(value_idx rit, KVP *out, const KVP &other) { - if (rit < m && other.value < out->value && - colors[rit] != colors[other.key]) { - out->key = other.key; + DI void operator()(value_idx rit, KVP* out, const KVP& other) + { + if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) { + out->key = other.key; out->value = other.value; } } - DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) { + DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) + { if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) { return a; } else return b; } - DI void init(value_t *out, value_t maxVal) { *out = maxVal; } - DI void init(KVP *out, value_t maxVal) { - out->key = -1; + DI void init(value_t* out, value_t maxVal) { *out = maxVal; } + DI void init(KVP* out, value_t maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -119,7 +123,8 @@ struct FixConnectivitiesRedOp { */ struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -137,13 +142,9 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce @@ -158,11 +159,10 @@ struct CubKVPMinReduce { * @return total number of components */ template -value_idx get_n_components(value_idx *colors, size_t n_rows, - cudaStream_t stream) { +value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream) +{ rmm::device_uvector map_ids(0, stream); - int num_clusters = - raft::label::getUniquelabels(map_ids, colors, n_rows, stream); + int num_clusters = raft::label::getUniquelabels(map_ids, colors, n_rows, stream); return num_clusters; } @@ -173,11 +173,12 @@ value_idx get_n_components(value_idx *colors, size_t n_rows, */ template struct LookupColorOp { - value_idx *colors; + value_idx* colors; - LookupColorOp(value_idx *colors_) : colors(colors_) {} + LookupColorOp(value_idx* colors_) : colors(colors_) {} - DI value_idx operator()(const cub::KeyValuePair &kvp) { + DI value_idx operator()(const cub::KeyValuePair& kvp) + { return colors[kvp.key]; } }; @@ -187,7 +188,8 @@ struct LookupColorOp { * the given array of components * @tparam value_idx * @tparam value_t - * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components + * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given + * array of components * @param[out] nn_colors components of nearest neighbors for each vertex * @param[in] colors components of each vertex * @param[in] X original dense data @@ -196,24 +198,38 @@ struct LookupColorOp { * @param[in] stream cuda stream for which to order cuda operations */ template -void perform_1nn(cub::KeyValuePair *kvp, - value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, cudaStream_t stream, - red_op reduction_op) { +void perform_1nn(cub::KeyValuePair* kvp, + value_idx* nn_colors, + value_idx* colors, + const value_t* X, + size_t n_rows, + size_t n_cols, + cudaStream_t stream, + red_op reduction_op) +{ rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); - raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, - true, stream); - - raft::distance::fusedL2NN, - value_idx>( - kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols, - workspace.data(), reduction_op, reduction_op, true, true, stream); + raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); + + raft::distance::fusedL2NN, value_idx>( + kvp, + X, + X, + x_norm.data(), + x_norm.data(), + n_rows, + n_rows, + n_cols, + workspace.data(), + reduction_op, + reduction_op, + true, + true, + stream); LookupColorOp extract_colors_op(colors); - thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, - extract_colors_op); + thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } /** @@ -229,27 +245,33 @@ void perform_1nn(cub::KeyValuePair *kvp, * @param stream stream for which to order CUDA operations */ template -void sort_by_color(value_idx *colors, value_idx *nn_colors, - cub::KeyValuePair *kvp, - value_idx *src_indices, size_t n_rows, cudaStream_t stream) { +void sort_by_color(value_idx* colors, + value_idx* nn_colors, + cub::KeyValuePair* kvp, + value_idx* src_indices, + size_t n_rows, + cudaStream_t stream) +{ thrust::counting_iterator arg_sort_iter(0); - thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, - src_indices); + thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices); - auto keys = thrust::make_zip_iterator(thrust::make_tuple( - colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); + auto keys = thrust::make_zip_iterator( + thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair*)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, - TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); } template -__global__ void min_components_by_color_kernel( - value_idx *out_rows, value_idx *out_cols, value_t *out_vals, - const value_idx *out_index, const value_idx *indices, - const cub::KeyValuePair *kvp, size_t nnz) { +__global__ void min_components_by_color_kernel(value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -278,19 +300,20 @@ __global__ void min_components_by_color_kernel( * @param[in] stream cuda stream for which to order cuda operations */ template -void min_components_by_color(raft::sparse::COO &coo, - const value_idx *out_index, - const value_idx *indices, - const cub::KeyValuePair *kvp, - size_t nnz, cudaStream_t stream) { +void min_components_by_color(raft::sparse::COO& coo, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz, + cudaStream_t stream) +{ /** * Arrays should be ordered by: colors_indptr->colors_n->kvp.value * so the last element of each column in the input CSR should be * the min. */ - min_components_by_color_kernel<<>>(coo.rows(), coo.cols(), coo.vals(), - out_index, indices, kvp, nnz); + min_components_by_color_kernel<<>>( + coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz); } /** @@ -312,12 +335,16 @@ void min_components_by_color(raft::sparse::COO &coo, * @param[in] n_cols number of cols in X */ template -void connect_components(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_t *X, const value_idx *orig_colors, - size_t n_rows, size_t n_cols, red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_components( + const raft::handle_t& handle, + raft::sparse::COO& out, + const value_t* X, + const value_idx* orig_colors, + size_t n_rows, + size_t n_cols, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, @@ -328,8 +355,7 @@ void connect_components(const raft::handle_t &handle, raft::copy_async(colors.data(), orig_colors, n_rows, stream); // Normalize colors so they are drawn from a monotonically increasing set - raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - true); + raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, true); value_idx n_components = get_n_components(colors.data(), n_rows, stream); @@ -338,36 +364,42 @@ void connect_components(const raft::handle_t &handle, * is guaranteed to be != color of its nearest neighbor. */ rmm::device_uvector nn_colors(n_rows, stream); - rmm::device_uvector> temp_inds_dists( - n_rows, stream); + rmm::device_uvector> temp_inds_dists(n_rows, stream); rmm::device_uvector src_indices(n_rows, stream); - perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, stream, reduction_op); + perform_1nn(temp_inds_dists.data(), + nn_colors.data(), + colors.data(), + X, + n_rows, + n_cols, + stream, + reduction_op); /** * Sort data points by color (neighbors are not sorted) */ // max_color + 1 = number of connected components // sort nn_colors by key w/ original colors - sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(), - src_indices.data(), n_rows, stream); + sort_by_color( + colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream); /** * Take the min for any duplicate colors */ // Compute mask of duplicates rmm::device_uvector out_index(n_rows + 1, stream); - raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), - nn_colors.data(), n_rows, stream); + raft::sparse::op::compute_duplicates_mask( + out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(), - out_index.data() + out_index.size(), out_index.data()); + thrust::exclusive_scan(handle.get_thrust_policy(), + out_index.data(), + out_index.data() + out_index.size(), + out_index.data()); // compute final size value_idx size = 0; - raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, - stream); + raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); size++; @@ -375,14 +407,14 @@ void connect_components(const raft::handle_t &handle, raft::sparse::COO min_edges(stream); min_edges.allocate(size, n_rows, n_rows, true, stream); - min_components_by_color(min_edges, out_index.data(), src_indices.data(), - temp_inds_dists.data(), n_rows, stream); + min_components_by_color( + min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream); /** * Symmetrize resulting edge list */ - raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(), - min_edges.vals(), n_rows, n_rows, size, out); + raft::sparse::linalg::symmetrize( + handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out); } }; // end namespace linkage diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index b796b63dc8..8486abd863 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -38,9 +38,11 @@ namespace selection { template struct csr_batcher_t { - csr_batcher_t(value_idx batch_size, value_idx n_rows, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data) + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) : batch_start_(0), batch_stop_(0), batch_rows_(0), @@ -50,32 +52,42 @@ struct csr_batcher_t { csr_indices_(csr_indices), csr_data_(csr_data), batch_csr_start_offset_(0), - batch_csr_stop_offset_(0) {} + batch_csr_stop_offset_(0) + { + } - void set_batch(int batch_num) { + void set_batch(int batch_num) + { batch_start_ = batch_num * batch_size_; - batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing - if (batch_stop_ >= total_rows_) - batch_stop_ = total_rows_ - 1; // zero-based indexing + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing batch_rows_ = (batch_stop_ - batch_start_) + 1; } - value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_indptr( - batch_start_, batch_stop_, csr_indptr_, batch_indptr, - &batch_csr_start_offset_, &batch_csr_stop_offset_, stream); + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); return batch_csr_stop_offset_ - batch_csr_start_offset_; } - void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_populate( - batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_, - csr_indices, csr_data, stream); + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); } value_idx batch_rows() const { return batch_rows_; } @@ -92,9 +104,9 @@ struct csr_batcher_t { value_idx total_rows_; - const value_idx *csr_indptr_; - const value_idx *csr_indices_; - const value_t *csr_data_; + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* csr_data_; value_idx batch_csr_start_offset_; value_idx batch_csr_stop_offset_; @@ -103,18 +115,26 @@ struct csr_batcher_t { template class sparse_knn_t { public: - sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_, - const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_, - int n_idx_cols_, const value_idx *queryIndptr_, - const value_idx *queryIndices_, const value_t *queryData_, - size_t queryNNZ_, int n_query_rows_, int n_query_cols_, - value_idx *output_indices_, value_t *output_dists_, int k_, - const raft::handle_t &handle_, - size_t batch_size_index_ = 2 << 14, // approx 1M - size_t batch_size_query_ = 2 << 14, - raft::distance::DistanceType metric_ = - raft::distance::DistanceType::L2Expanded, - float metricArg_ = 0) + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + const raft::handle_t& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded, + float metricArg_ = 0) : idxIndptr(idxIndptr_), idxIndices(idxIndices_), idxData(idxData_), @@ -134,9 +154,12 @@ class sparse_knn_t { batch_size_index(batch_size_index_), batch_size_query(batch_size_query_), metric(metric_), - metricArg(metricArg_) {} + metricArg(metricArg_) + { + } - void run() { + void run() + { using namespace raft::sparse; int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); @@ -147,37 +170,33 @@ class sparse_knn_t { for (int i = 0; i < n_batches_query; i++) { /** - * Compute index batch info - */ + * Compute index batch info + */ query_batcher.set_batch(i); /** - * Slice CSR to rows in batch - */ + * Slice CSR to rows in batch + */ - rmm::device_uvector query_batch_indptr( - query_batcher.batch_rows() + 1, handle.get_stream()); + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + handle.get_stream()); - value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( - query_batch_indptr.data(), handle.get_stream()); + value_idx n_query_batch_nnz = + query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream()); - rmm::device_uvector query_batch_indices(n_query_batch_nnz, - handle.get_stream()); - rmm::device_uvector query_batch_data(n_query_batch_nnz, - handle.get_stream()); + rmm::device_uvector query_batch_indices(n_query_batch_nnz, handle.get_stream()); + rmm::device_uvector query_batch_data(n_query_batch_nnz, handle.get_stream()); - query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), - query_batch_data.data(), - handle.get_stream()); + query_batcher.get_batch_csr_indices_data( + query_batch_indices.data(), query_batch_data.data(), handle.get_stream()); // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent // batches and 1 space for the results of the merge, which get copied back to the top - rmm::device_uvector merge_buffer_indices(0, - handle.get_stream()); + rmm::device_uvector merge_buffer_indices(0, handle.get_stream()); rmm::device_uvector merge_buffer_dists(0, handle.get_stream()); - value_t *dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_ptr; + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); csr_batcher_t idx_batcher( @@ -186,22 +205,19 @@ class sparse_knn_t { for (int j = 0; j < n_batches_idx; j++) { idx_batcher.set_batch(j); - merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); - merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); /** - * Slice CSR to rows in batch - */ - rmm::device_uvector idx_batch_indptr( - idx_batcher.batch_rows() + 1, handle.get_stream()); - rmm::device_uvector idx_batch_indices(0, - handle.get_stream()); + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + handle.get_stream()); + rmm::device_uvector idx_batch_indices(0, handle.get_stream()); rmm::device_uvector idx_batch_data(0, handle.get_stream()); - value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( - idx_batch_indptr.data(), handle.get_stream()); + value_idx idx_batch_nnz = + idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream()); idx_batch_indices.resize(idx_batch_nnz, handle.get_stream()); idx_batch_data.resize(idx_batch_nnz, handle.get_stream()); @@ -210,111 +226,126 @@ class sparse_knn_t { idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream()); /** - * Compute distances - */ - size_t dense_size = - idx_batcher.batch_rows() * query_batcher.batch_rows(); - rmm::device_uvector batch_dists(dense_size, - handle.get_stream()); - - CUDA_CHECK(cudaMemset(batch_dists.data(), 0, - batch_dists.size() * sizeof(value_t))); - - compute_distances(idx_batcher, query_batcher, idx_batch_nnz, - n_query_batch_nnz, idx_batch_indptr.data(), - idx_batch_indices.data(), idx_batch_data.data(), - query_batch_indptr.data(), query_batch_indices.data(), - query_batch_data.data(), batch_dists.data()); + * Compute distances + */ + size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, handle.get_stream()); + + CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); // Build batch indices array - rmm::device_uvector batch_indices(batch_dists.size(), - handle.get_stream()); + rmm::device_uvector batch_indices(batch_dists.size(), handle.get_stream()); // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); - iota_fill(batch_indices.data(), batch_rows, batch_cols, - handle.get_stream()); + iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream()); /** * Perform k-selection on batch & merge with other k-selections */ size_t merge_buffer_offset = batch_rows * k; - dists_merge_buffer_ptr = - merge_buffer_dists.data() + merge_buffer_offset; - indices_merge_buffer_ptr = - merge_buffer_indices.data() + merge_buffer_offset; - - perform_k_selection(idx_batcher, query_batcher, batch_dists.data(), - batch_indices.data(), dists_merge_buffer_ptr, + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, indices_merge_buffer_ptr); - value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; // Merge results of difference batches if necessary if (idx_batcher.batch_start() > 0) { - size_t merge_buffer_tmp_out = batch_rows * k * 2; - dists_merge_buffer_tmp_ptr = - merge_buffer_dists.data() + merge_buffer_tmp_out; - indices_merge_buffer_tmp_ptr = - merge_buffer_indices.data() + merge_buffer_tmp_out; - - merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(), - merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr, + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + dists_merge_buffer_tmp_ptr, indices_merge_buffer_tmp_ptr); } // copy merged output back into merge buffer partition for next iteration raft::copy_async(merge_buffer_indices.data(), indices_merge_buffer_tmp_ptr, - batch_rows * k, handle.get_stream()); + batch_rows * k, + handle.get_stream()); raft::copy_async(merge_buffer_dists.data(), - dists_merge_buffer_tmp_ptr, batch_rows * k, + dists_merge_buffer_tmp_ptr, + batch_rows * k, handle.get_stream()); } // Copy final merged batch to output array - raft::copy_async( - output_indices + (rows_processed * k), merge_buffer_indices.data(), - query_batcher.batch_rows() * k, handle.get_stream()); - raft::copy_async( - output_dists + (rows_processed * k), merge_buffer_dists.data(), - query_batcher.batch_rows() * k, handle.get_stream()); + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); rows_processed += query_batcher.batch_rows(); } } private: - void merge_batches(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - value_t *merge_buffer_dists, - value_idx *merge_buffer_indices, value_t *out_dists, - value_idx *out_indices) { + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { // build translation buffer to shift resulting indices by the batch std::vector id_ranges; id_ranges.push_back(0); id_ranges.push_back(idx_batcher.batch_start()); rmm::device_uvector trans(id_ranges.size(), handle.get_stream()); - raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), - handle.get_stream()); + raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream()); // combine merge buffers only if there's more than 1 partition to combine - raft::spatial::knn::knn_merge_parts( - merge_buffer_dists, merge_buffer_indices, out_dists, out_indices, - query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data()); + raft::spatial::knn::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + handle.get_stream(), + trans.data()); } void perform_k_selection(csr_batcher_t idx_batcher, csr_batcher_t query_batcher, - value_t *batch_dists, value_idx *batch_indices, - value_t *out_dists, value_idx *out_indices) { + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); // build translation buffer to shift resulting indices by the batch std::vector id_ranges; @@ -329,52 +360,60 @@ class sparse_knn_t { if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; // kernel to slice first (min) k cols and copy into batched merge buffer - raft::spatial::knn::select_k(batch_dists, batch_indices, batch_rows, - batch_cols, out_dists, out_indices, ascending, - n_neighbors, handle.get_stream()); + raft::spatial::knn::select_k(batch_dists, + batch_indices, + batch_rows, + batch_cols, + out_dists, + out_indices, + ascending, + n_neighbors, + handle.get_stream()); } - void compute_distances(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - size_t idx_batch_nnz, size_t query_batch_nnz, - value_idx *idx_batch_indptr, - value_idx *idx_batch_indices, value_t *idx_batch_data, - value_idx *query_batch_indptr, - value_idx *query_batch_indices, - value_t *query_batch_data, value_t *batch_dists) { + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { /** * Compute distances */ - raft::sparse::distance::distances_config_t dist_config( - handle); + raft::sparse::distance::distances_config_t dist_config(handle); dist_config.b_nrows = idx_batcher.batch_rows(); dist_config.b_ncols = n_idx_cols; - dist_config.b_nnz = idx_batch_nnz; + dist_config.b_nnz = idx_batch_nnz; - dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indptr = idx_batch_indptr; dist_config.b_indices = idx_batch_indices; - dist_config.b_data = idx_batch_data; + dist_config.b_data = idx_batch_data; dist_config.a_nrows = query_batcher.batch_rows(); dist_config.a_ncols = n_query_cols; - dist_config.a_nnz = query_batch_nnz; + dist_config.a_nnz = query_batch_nnz; - dist_config.a_indptr = query_batch_indptr; + dist_config.a_indptr = query_batch_indptr; dist_config.a_indices = query_batch_indices; - dist_config.a_data = query_batch_data; + dist_config.a_data = query_batch_data; if (raft::sparse::distance::supportedDistance.find(metric) == raft::sparse::distance::supportedDistance.end()) THROW("DistanceType not supported: %d", metric); - raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, - metricArg); + raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); } const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; - value_idx *output_indices; + value_idx* output_indices; const value_t *idxData, *queryData; - value_t *output_dists; + value_t* output_dists; size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; @@ -384,50 +423,74 @@ class sparse_knn_t { int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; - const raft::handle_t &handle; + const raft::handle_t& handle; }; /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNA number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) - */ + * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors + * using some distance implementation + * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) + * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) + * @param[in] idxData csr data array of the index matrix (size idxNNZ) + * @param[in] idxNNA number of non-zeros for sparse index matrix + * @param[in] n_idx_rows number of data samples in index matrix + * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) + * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) + * @param[in] queryData csr data array of the query matrix (size queryNNZ) + * @param[in] queryNNZ number of non-zeros for sparse query matrix + * @param[in] n_query_rows number of data samples in query matrix + * @param[in] n_query_cols number of features in query matrix + * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) + * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) + * @param[in] k the number of neighbors to query + * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to + * @param[in] batch_size_index maximum number of rows to use from index matrix per batch + * @param[in] batch_size_query maximum number of rows to use from query matrix per batch + * @param[in] metric distance metric/measure to use + * @param[in] metricArg potential argument for metric (currently unused) + */ template -void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, - const value_t *idxData, size_t idxNNZ, int n_idx_rows, - int n_idx_cols, const value_idx *queryIndptr, - const value_idx *queryIndices, const value_t *queryData, - size_t queryNNZ, int n_query_rows, int n_query_cols, - value_idx *output_indices, value_t *output_dists, int k, - const raft::handle_t &handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - sparse_knn_t( - idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, - queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, - output_indices, output_dists, k, handle, batch_size_index, batch_size_query, - metric, metricArg) +void brute_force_knn(const value_idx* idxIndptr, + const value_idx* idxIndices, + const value_t* idxData, + size_t idxNNZ, + int n_idx_rows, + int n_idx_cols, + const value_idx* queryIndptr, + const value_idx* queryIndices, + const value_t* queryData, + size_t queryNNZ, + int n_query_rows, + int n_query_cols, + value_idx* output_indices, + value_t* output_dists, + int k, + const raft::handle_t& handle, + size_t batch_size_index = 2 << 14, // approx 1M + size_t batch_size_query = 2 << 14, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + sparse_knn_t(idxIndptr, + idxIndices, + idxData, + idxNNZ, + n_idx_rows, + n_idx_cols, + queryIndptr, + queryIndices, + queryData, + queryNNZ, + n_query_rows, + n_query_cols, + output_indices, + output_dists, + k, + handle, + batch_size_index, + batch_size_query, + metric, + metricArg) .run(); } diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 3df1c77081..f13c43c306 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -45,31 +45,34 @@ namespace selection { * @param m */ template -__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) { +__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz) +{ value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; - value_idx v = tid / m; + value_idx v = tid / m; indices[tid] = v; } template -value_idx build_k(value_idx n_samples, int c) { +value_idx build_k(value_idx n_samples, int c) +{ // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering // approach on GPU" - return min(n_samples, - max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); + return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); } template -__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) { +__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; - out_t v = inds[tid]; + out_t v = inds[tid]; out[tid] = v; } template -void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { +void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream) +{ size_t blocks = ceildiv(size, (size_t)tpb); conv_indices_kernel<<>>(inds, out, size); } @@ -92,9 +95,14 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { * @param c */ template -void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, +void knn_graph(const handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - raft::sparse::COO &out, int c = 15) { + raft::sparse::COO& out, + int c = 15) +{ int k = build_k(m, c); auto stream = handle.get_stream(); @@ -108,8 +116,8 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, size_t blocks = ceildiv(nnz, (size_t)256); fill_indices<<>>(rows.data(), k, nnz); - std::vector inputs; - inputs.push_back(const_cast(X)); + std::vector inputs; + inputs.push_back(const_cast(X)); std::vector sizes; sizes.push_back(m); @@ -119,15 +127,25 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, rmm::device_uvector int64_indices(nnz, stream); uint32_t knn_start = curTimeMillis(); - raft::spatial::knn::brute_force_knn( - handle, inputs, sizes, n, const_cast(X), m, int64_indices.data(), - data.data(), k, true, true, nullptr, metric); + raft::spatial::knn::brute_force_knn(handle, + inputs, + sizes, + n, + const_cast(X), + m, + int64_indices.data(), + data.data(), + k, + true, + true, + nullptr, + metric); // convert from current knn's 64-bit to 32-bit. conv_indices(int64_indices.data(), indices.data(), nnz, stream); - raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(), - data.data(), m, k, nnz, out); + raft::sparse::linalg::symmetrize( + handle, rows.data(), indices.data(), data.data(), m, k, nnz, out); } }; // namespace selection diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h index 63578bf1f3..56e8832e0a 100644 --- a/cpp/include/raft/sparse/utils.h +++ b/cpp/include/raft/sparse/utils.h @@ -26,7 +26,8 @@ namespace sparse { * @param[in] ncols number of blocks to quantize */ template -inline int block_dim(value_idx ncols) { +inline int block_dim(value_idx ncols) +{ int blockdim; if (ncols <= 32) blockdim = 32; @@ -54,9 +55,9 @@ inline int block_dim(value_idx ncols) { * @return */ template -__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, - G key) { - unsigned int mask = __ballot_sync(init_mask, true); +__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key) +{ + unsigned int mask = __ballot_sync(init_mask, true); unsigned int peer_group = 0; bool is_peer; @@ -77,12 +78,14 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, } #endif -__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ return __ffs(peer_group) - 1; } template -__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { +__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols) +{ int row = blockIdx.x; int tid = threadIdx.x; @@ -92,15 +95,16 @@ __global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { } template -void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols, - cudaStream_t stream) { +void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream) +{ int blockdim = block_dim(ncols); iota_fill_block_kernel<<>>(indices, ncols); } template -__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) { +__device__ int get_stop_idx(T row, T m, T nnz, const T* ind) +{ int stop_idx = 0; if (row < (m - 1)) stop_idx = ind[row + 1]; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 2cdf9bf4f5..e8cc85256d 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -42,14 +42,16 @@ namespace knn { * @param[in] D the dimensionality of the index array */ template -inline void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - knnIndexParam *params, +inline void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, - value_idx n, value_idx D) { - detail::approx_knn_build_index(handle, index, params, metric, metricArg, - index_array, n, D); + float metricArg, + float* index_array, + value_idx n, + value_idx D) +{ + detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); } /** @@ -66,12 +68,15 @@ inline void approx_knn_build_index(raft::handle_t &handle, * @param[in] n number of rows in the query array */ template -inline void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, - raft::spatial::knn::knnIndex *index, value_idx k, - float *query_array, value_idx n) { - detail::approx_knn_search(handle, distances, indices, index, k, query_array, - n); +inline void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + value_idx k, + float* query_array, + value_idx n) +{ + detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); } } // namespace knn diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 6a6c7751c2..573a23181d 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -26,13 +26,14 @@ namespace spatial { namespace knn { struct knnIndex { - faiss::gpu::GpuIndex *index; + faiss::gpu::GpuIndex* index; raft::distance::DistanceType metric; float metricArg; - faiss::gpu::StandardGpuResources *gpu_res; + faiss::gpu::StandardGpuResources* gpu_res; int device; - ~knnIndex() { + ~knnIndex() + { delete index; delete gpu_res; } @@ -57,7 +58,8 @@ struct IVFParam : knnIndexParam { int nprobe; }; -struct IVFFlatParam : IVFParam {}; +struct IVFFlatParam : IVFParam { +}; struct IVFPQParam : IVFParam { int M; diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index a98473f186..cb2b9e99cd 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -28,12 +28,11 @@ namespace raft { namespace spatial { namespace knn { -template -void rbc_build_index(const raft::handle_t &handle, - BallCoverIndex &index) { - ASSERT(index.n == 2, - "Random ball cover currently only works in 2-dimensions"); +template +void rbc_build_index(const raft::handle_t& handle, + BallCoverIndex& index) +{ + ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions"); if (index.metric == raft::distance::DistanceType::Haversine) { detail::rbc_build_index(handle, index, detail::HaversineFunc()); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || @@ -74,23 +73,23 @@ void rbc_build_index(const raft::handle_t &handle, * many datasets can still have great recall even by only * looking in the closest landmark. */ -template -void rbc_all_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, value_idx *inds, value_t *dists, - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "Random ball cover currently only works in 2-dimensions"); +template +void rbc_all_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + value_idx* inds, + value_t* dists, + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions"); if (index.metric == raft::distance::DistanceType::Haversine) { - detail::rbc_all_knn_query(handle, index, k, inds, dists, - detail::HaversineFunc(), perform_post_filtering, - weight); + detail::rbc_all_knn_query( + handle, index, k, inds, dists, detail::HaversineFunc(), perform_post_filtering, weight); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { - detail::rbc_all_knn_query(handle, index, k, inds, dists, - detail::EuclideanFunc(), perform_post_filtering, - weight); + detail::rbc_all_knn_query( + handle, index, k, inds, dists, detail::EuclideanFunc(), perform_post_filtering, weight); } else { RAFT_FAIL("Metric not supported"); } @@ -127,23 +126,40 @@ void rbc_all_knn_query(const raft::handle_t &handle, * looking in the closest landmark. * @param[in] n_query_pts number of query points */ -template -void rbc_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, const value_t *query, value_int n_query_pts, - value_idx *inds, value_t *dists, - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "Random ball cover currently only works in 2-dimensions"); +template +void rbc_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + const value_t* query, + value_int n_query_pts, + value_idx* inds, + value_t* dists, + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions"); if (index.metric == raft::distance::DistanceType::Haversine) { - detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, - detail::HaversineFunc(), perform_post_filtering, + detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + detail::HaversineFunc(), + perform_post_filtering, weight); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { - detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, - detail::EuclideanFunc(), perform_post_filtering, + detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + detail::EuclideanFunc(), + perform_post_filtering, weight); } else { RAFT_FAIL("Metric not supported"); diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index ca614bb0cb..e38124edb6 100644 --- a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -34,12 +34,13 @@ namespace knn { * @tparam value_t * @tparam value_int */ -template +template class BallCoverIndex { public: - explicit BallCoverIndex(const raft::handle_t &handle_, const value_t *X_, - value_int m_, value_int n_, + explicit BallCoverIndex(const raft::handle_t& handle_, + const value_t* X_, + value_int m_, + value_int n_, raft::distance::DistanceType metric_) : handle(handle_), X(X_), @@ -47,37 +48,39 @@ class BallCoverIndex { n(n_), metric(metric_), /** - * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound - * - * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) - */ + * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound + * + * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) + */ n_landmarks(sqrt(m_)), R_indptr(sqrt(m_) + 1, handle.get_stream()), R_1nn_cols(m_, handle.get_stream()), R_1nn_dists(m_, handle.get_stream()), R(sqrt(m_) * n_, handle.get_stream()), R_radius(sqrt(m_), handle.get_stream()), - index_trained(false) {} + index_trained(false) + { + } - value_idx *get_R_indptr() { return R_indptr.data(); } - value_idx *get_R_1nn_cols() { return R_1nn_cols.data(); } - value_t *get_R_1nn_dists() { return R_1nn_dists.data(); } - value_t *get_R_radius() { return R_radius.data(); } - value_t *get_R() { return R.data(); } - const value_t *get_X() { return X; } + value_idx* get_R_indptr() { return R_indptr.data(); } + value_idx* get_R_1nn_cols() { return R_1nn_cols.data(); } + value_t* get_R_1nn_dists() { return R_1nn_dists.data(); } + value_t* get_R_radius() { return R_radius.data(); } + value_t* get_R() { return R.data(); } + const value_t* get_X() { return X; } bool is_index_trained() const { return index_trained; }; // This should only be set by internal functions void set_index_trained() { index_trained = true; } - const raft::handle_t &handle; + const raft::handle_t& handle; const value_int m; const value_int n; const value_int n_landmarks; - const value_t *X; + const value_t* X; raft::distance::DistanceType metric; diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 980001f166..7f4e4511d2 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -55,90 +55,84 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype( - QuantizerType qtype) { +inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) +{ switch (qtype) { - case QuantizerType::QT_8bit: - return faiss::ScalarQuantizer::QuantizerType::QT_8bit; + case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; case QuantizerType::QT_8bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; case QuantizerType::QT_4bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; - case QuantizerType::QT_fp16: - return faiss::ScalarQuantizer::QuantizerType::QT_fp16; + case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; case QuantizerType::QT_8bit_direct: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; - case QuantizerType::QT_6bit: - return faiss::ScalarQuantizer::QuantizerType::QT_6bit; - default: - return (faiss::ScalarQuantizer::QuantizerType)qtype; + case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; + default: return (faiss::ScalarQuantizer::QuantizerType)qtype; } } template -void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfflat_build_index( + knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = index->device; + config.device = index->device; faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat( - index->gpu_res, D, params->nlist, faiss_metric, config); + faiss::gpu::GpuIndexIVFFlat* faiss_index = + new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfpq_build_index( + knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFPQConfig config; - config.device = index->device; - config.usePrecomputedTables = params->usePrecomputedTables; - config.interleavedLayout = params->n_bits != 8; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFPQ *faiss_index = - new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M, - params->n_bits, faiss_metric, config); + config.device = index->device; + config.usePrecomputedTables = params->usePrecomputedTables; + config.interleavedLayout = params->n_bits != 8; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ( + index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfsq_build_index( + knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; - config.device = index->device; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::ScalarQuantizer::QuantizerType faiss_qtype = - build_faiss_qtype(params->qtype); - faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index = - new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist, - faiss_qtype, faiss_metric, - params->encodeResidual); + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params->qtype); + faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer( + index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - raft::spatial::knn::knnIndexParam *params, +void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + raft::spatial::knn::knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, IntType n, - IntType D) { + float metricArg, + float* index_array, + IntType n, + IntType D) +{ int device; CUDA_CHECK(cudaGetDevice(&device)); - faiss::gpu::StandardGpuResources *gpu_res = - new faiss::gpu::StandardGpuResources(); + faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources(); gpu_res->noTempMemory(); gpu_res->setDefaultStream(device, handle.get_stream()); - index->gpu_res = gpu_res; - index->device = device; - index->index = nullptr; - index->metric = metric; + index->gpu_res = gpu_res; + index->device = device; + index->index = nullptr; + index->metric = metric; index->metricArg = metricArg; // perform preprocessing @@ -148,21 +142,20 @@ void approx_knn_build_index(raft::handle_t &handle, query_metric_processor->preprocess(index_array); - if (dynamic_cast(params)) { - IVFFlatParam *IVFFlat_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFFlatParam* IVFFlat_param = dynamic_cast(params); approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D); std::vector h_index_array(n * D); - raft::update_host(h_index_array.data(), index_array, h_index_array.size(), - handle.get_stream()); + raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream()); query_metric_processor->revert(index_array); index->index->train(n, h_index_array.data()); index->index->add(n, h_index_array.data()); } else { - if (dynamic_cast(params)) { - IVFPQParam *IVFPQ_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFPQParam* IVFPQ_param = dynamic_cast(params); approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D); - } else if (dynamic_cast(params)) { - IVFSQParam *IVFSQ_param = dynamic_cast(params); + } else if (dynamic_cast(params)) { + IVFSQParam* IVFSQ_param = dynamic_cast(params); approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D); } else { ASSERT(index->index, "KNN index could not be initialized"); @@ -175,13 +168,17 @@ void approx_knn_build_index(raft::handle_t &handle, } template -void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, raft::spatial::knn::knnIndex *index, - IntType k, float *query_array, IntType n) { +void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + IntType k, + float* query_array, + IntType n) +{ // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream()); + create_processor(index->metric, n, index->index->d, k, false, handle.get_stream()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); @@ -192,13 +189,14 @@ void approx_knn_search(raft::handle_t &handle, float *distances, index->metric == raft::distance::DistanceType::L2SqrtUnexpanded || index->metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (index->metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / index->metricArg; + if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg; raft::linalg::unaryOp( - distances, distances, n * k, + distances, + distances, + n * k, [p] __device__(float input) { return powf(input, p); }, handle.get_stream()); } diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 7354fa3497..7b54c3d25b 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -60,34 +60,43 @@ namespace detail { * @param handle * @param index */ -template -void sample_landmarks(const raft::handle_t &handle, - BallCoverIndex &index) { - rmm::device_uvector R_1nn_cols2(index.n_landmarks, - handle.get_stream()); +template +void sample_landmarks(const raft::handle_t& handle, + BallCoverIndex& index) +{ + rmm::device_uvector R_1nn_cols2(index.n_landmarks, handle.get_stream()); rmm::device_uvector R_1nn_ones(index.m, handle.get_stream()); - rmm::device_uvector R_indices(index.n_landmarks, - handle.get_stream()); + rmm::device_uvector R_indices(index.n_landmarks, handle.get_stream()); - thrust::sequence(handle.get_thrust_policy(), index.get_R_1nn_cols(), - index.get_R_1nn_cols() + index.m, (value_idx)0); + thrust::sequence(handle.get_thrust_policy(), + index.get_R_1nn_cols(), + index.get_R_1nn_cols() + index.m, + (value_idx)0); - thrust::fill(handle.get_thrust_policy(), R_1nn_ones.data(), - R_1nn_ones.data() + R_1nn_ones.size(), 1.0); + thrust::fill( + handle.get_thrust_policy(), R_1nn_ones.data(), R_1nn_ones.data() + R_1nn_ones.size(), 1.0); /** - * 1. Randomly sample sqrt(n) points from X - */ + * 1. Randomly sample sqrt(n) points from X + */ auto rng = raft::random::Rng(12345); - rng.sampleWithoutReplacement(handle, R_indices.data(), R_1nn_cols2.data(), - index.get_R_1nn_cols(), R_1nn_ones.data(), - (value_idx)index.n_landmarks, (value_idx)index.m, + rng.sampleWithoutReplacement(handle, + R_indices.data(), + R_1nn_cols2.data(), + index.get_R_1nn_cols(), + R_1nn_ones.data(), + (value_idx)index.n_landmarks, + (value_idx)index.m, handle.get_stream()); - raft::matrix::copyRows( - index.get_X(), index.m, index.n, index.get_R(), R_1nn_cols2.data(), - index.n_landmarks, handle.get_stream(), true); + raft::matrix::copyRows(index.get_X(), + index.m, + index.n, + index.get_R(), + R_1nn_cols2.data(), + index.n_landmarks, + handle.get_stream(), + true); } /** @@ -100,35 +109,34 @@ void sample_landmarks(const raft::handle_t &handle, * @param k * @param index */ -template -void construct_landmark_1nn( - const raft::handle_t &handle, const value_idx *R_knn_inds_ptr, - const value_t *R_knn_dists_ptr, value_int k, - BallCoverIndex &index) { +template +void construct_landmark_1nn(const raft::handle_t& handle, + const value_idx* R_knn_inds_ptr, + const value_t* R_knn_dists_ptr, + value_int k, + BallCoverIndex& index) +{ rmm::device_uvector R_1nn_inds(index.m, handle.get_stream()); - value_idx *R_1nn_inds_ptr = R_1nn_inds.data(); - value_t *R_1nn_dists_ptr = index.get_R_1nn_dists(); + value_idx* R_1nn_inds_ptr = R_1nn_inds.data(); + value_t* R_1nn_dists_ptr = index.get_R_1nn_dists(); auto idxs = thrust::make_counting_iterator(0); - thrust::for_each(handle.get_thrust_policy(), idxs, idxs + index.m, - [=] __device__(value_idx i) { - R_1nn_inds_ptr[i] = R_knn_inds_ptr[i * k]; - R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k]; - }); + thrust::for_each(handle.get_thrust_policy(), idxs, idxs + index.m, [=] __device__(value_idx i) { + R_1nn_inds_ptr[i] = R_knn_inds_ptr[i * k]; + R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k]; + }); - auto keys = thrust::make_zip_iterator( - thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists())); + auto keys = + thrust::make_zip_iterator(thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists())); // group neighborhoods for each reference landmark and sort each group by distance - thrust::sort_by_key(handle.get_thrust_policy(), keys, keys + index.m, - index.get_R_1nn_cols(), NNComp()); + thrust::sort_by_key( + handle.get_thrust_policy(), keys, keys + index.m, index.get_R_1nn_cols(), NNComp()); // convert to CSR for fast lookup raft::sparse::convert::sorted_coo_to_csr( - R_1nn_inds.data(), index.m, index.get_R_indptr(), index.n_landmarks + 1, - handle.get_stream()); + R_1nn_inds.data(), index.m, index.get_R_indptr(), index.n_landmarks + 1, handle.get_stream()); } /** @@ -144,20 +152,33 @@ void construct_landmark_1nn( * @param R_knn_inds * @param R_knn_dists */ -template -void k_closest_landmarks(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query_pts, value_int n_query_pts, - value_int k, value_idx *R_knn_inds, - value_t *R_knn_dists) { - std::vector input = {index.get_R()}; +template +void k_closest_landmarks(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query_pts, + value_int n_query_pts, + value_int k, + value_idx* R_knn_inds, + value_t* R_knn_dists) +{ + std::vector input = {index.get_R()}; std::vector sizes = {index.n_landmarks}; - brute_force_knn_impl( - input, sizes, index.n, const_cast(query_pts), n_query_pts, - R_knn_inds, R_knn_dists, k, handle.get_stream(), nullptr, 0, true, true, - nullptr, index.metric); + brute_force_knn_impl(input, + sizes, + index.n, + const_cast(query_pts), + n_query_pts, + R_knn_inds, + R_knn_dists, + k, + handle.get_stream(), + nullptr, + 0, + true, + true, + nullptr, + index.metric); } /** @@ -168,21 +189,21 @@ void k_closest_landmarks(const raft::handle_t &handle, * @param handle * @param index */ -template -void compute_landmark_radii( - const raft::handle_t &handle, - BallCoverIndex &index) { +template +void compute_landmark_radii(const raft::handle_t& handle, + BallCoverIndex& index) +{ auto entries = thrust::make_counting_iterator(0); - const value_idx *R_indptr_ptr = index.get_R_indptr(); - const value_t *R_1nn_dists_ptr = index.get_R_1nn_dists(); - value_t *R_radius_ptr = index.get_R_radius(); - thrust::for_each(handle.get_thrust_policy(), entries, + const value_idx* R_indptr_ptr = index.get_R_indptr(); + const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists(); + value_t* R_radius_ptr = index.get_R_radius(); + thrust::for_each(handle.get_thrust_policy(), + entries, entries + index.n_landmarks, [=] __device__(value_idx input) { value_idx last_row_idx = R_indptr_ptr[input + 1] - 1; - R_radius_ptr[input] = R_1nn_dists_ptr[last_row_idx]; + R_radius_ptr[input] = R_1nn_dists_ptr[last_row_idx]; }); } @@ -196,23 +217,51 @@ void compute_landmark_radii( * marking the distance to be computed between x, y only * if knn[k].distance >= d(x_i, R_k) + d(R_k, y) */ -template -void perform_rbc_query(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query, value_int n_query_pts, - std::uint32_t k, const value_idx *R_knn_inds, - const value_t *R_knn_dists, dist_func dfunc, - value_idx *inds, value_t *dists, - value_int *dists_counter, value_int *post_dists_counter, - float weight = 1.0, bool perform_post_filtering = true) { +template +void perform_rbc_query(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query, + value_int n_query_pts, + std::uint32_t k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func dfunc, + value_idx* inds, + value_t* dists, + value_int* dists_counter, + value_int* post_dists_counter, + float weight = 1.0, + bool perform_post_filtering = true) +{ // Compute nearest k for each neighborhood in each closest R - rbc_low_dim_pass_one(handle, index, query, n_query_pts, k, R_knn_inds, - R_knn_dists, dfunc, inds, dists, weight, dists_counter); + rbc_low_dim_pass_one(handle, + index, + query, + n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, + dists_counter); if (perform_post_filtering) { - rbc_low_dim_pass_two(handle, index, query, n_query_pts, k, R_knn_inds, - R_knn_dists, dfunc, inds, dists, weight, + rbc_low_dim_pass_two(handle, + index, + query, + n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, post_dists_counter); } } @@ -228,13 +277,15 @@ void perform_rbc_query(const raft::handle_t &handle, * query which is useful for algorithms that need to perform * A * A.T. */ -template -void rbc_build_index(const raft::handle_t &handle, - BallCoverIndex &index, - distance_func dfunc) { - ASSERT(index.n == 2, - "only 2d vectors are supported in current implementation"); +template +void rbc_build_index(const raft::handle_t& handle, + BallCoverIndex& index, + distance_func dfunc) +{ + ASSERT(index.n == 2, "only 2d vectors are supported in current implementation"); ASSERT(!index.is_index_trained(), "index cannot be previously trained"); rmm::device_uvector R_knn_inds(index.m, handle.get_stream()); @@ -249,8 +300,8 @@ void rbc_build_index(const raft::handle_t &handle, * 2. Perform knn = bfknn(X, R, k) */ value_int k = 1; - k_closest_landmarks(handle, index, index.get_X(), index.m, k, - R_knn_inds.data(), R_knn_dists.data()); + k_closest_landmarks( + handle, index, index.get_X(), index.m, k, R_knn_inds.data(), R_knn_dists.data()); /** * 3. Create L_r = knn[:,0].T (CSR) @@ -258,8 +309,7 @@ void rbc_build_index(const raft::handle_t &handle, * Slice closest neighboring R * Secondary sort by (R_knn_inds, R_knn_dists) */ - construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, - index); + construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index); /** * Compute radius of each R for filtering: p(q, r) <= p(q, q_r) + radius(r) @@ -271,16 +321,21 @@ void rbc_build_index(const raft::handle_t &handle, /** * Performs an all neighbors knn query (e.g. index == query) */ -template -void rbc_all_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, value_idx *inds, value_t *dists, +template +void rbc_all_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + value_idx* inds, + value_t* dists, distance_func dfunc, // approximate nn options - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "only 2d vectors are supported in current implementation"); + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "only 2d vectors are supported in current implementation"); ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k"); ASSERT(!index.is_index_trained(), "index cannot be previously trained"); @@ -289,22 +344,30 @@ void rbc_all_knn_query(const raft::handle_t &handle, // For debugging / verification. Remove before releasing rmm::device_uvector dists_counter(index.m, handle.get_stream()); - rmm::device_uvector post_dists_counter(index.m, - handle.get_stream()); + rmm::device_uvector post_dists_counter(index.m, handle.get_stream()); sample_landmarks(handle, index); - k_closest_landmarks(handle, index, index.get_X(), index.m, k, - R_knn_inds.data(), R_knn_dists.data()); + k_closest_landmarks( + handle, index, index.get_X(), index.m, k, R_knn_inds.data(), R_knn_dists.data()); - construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, - index); + construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index); compute_landmark_radii(handle, index); - perform_rbc_query(handle, index, index.get_X(), index.m, k, R_knn_inds.data(), - R_knn_dists.data(), dfunc, inds, dists, - dists_counter.data(), post_dists_counter.data(), weight, + perform_rbc_query(handle, + index, + index.get_X(), + index.m, + k, + R_knn_inds.data(), + R_knn_dists.data(), + dfunc, + inds, + dists, + dists_counter.data(), + post_dists_counter.data(), + weight, perform_post_filtering); } @@ -312,35 +375,50 @@ void rbc_all_knn_query(const raft::handle_t &handle, * Performs a knn query against an index. This assumes the index has * already been built. */ -template -void rbc_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, const value_t *query, value_int n_query_pts, - value_idx *inds, value_t *dists, distance_func dfunc, +template +void rbc_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + const value_t* query, + value_int n_query_pts, + value_idx* inds, + value_t* dists, + distance_func dfunc, // approximate nn options - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "only 2d vectors are supported in current implementation"); + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "only 2d vectors are supported in current implementation"); ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k"); ASSERT(index.is_index_trained(), "index must be previously trained"); rmm::device_uvector R_knn_inds(k * index.m, handle.get_stream()); rmm::device_uvector R_knn_dists(k * index.m, handle.get_stream()); - k_closest_landmarks(handle, index, query, n_query_pts, k, R_knn_inds.data(), - R_knn_dists.data()); + k_closest_landmarks(handle, index, query, n_query_pts, k, R_knn_inds.data(), R_knn_dists.data()); // For debugging / verification. Remove before releasing rmm::device_uvector dists_counter(index.m, handle.get_stream()); - rmm::device_uvector post_dists_counter(index.m, - handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), post_dists_counter.data(), - post_dists_counter.data() + index.m, 0); - - perform_rbc_query(handle, index, query, n_query_pts, k, R_knn_inds.data(), - R_knn_dists.data(), dfunc, inds, dists, - dists_counter.data(), post_dists_counter.data(), weight, + rmm::device_uvector post_dists_counter(index.m, handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), post_dists_counter.data(), post_dists_counter.data() + index.m, 0); + + perform_rbc_query(handle, + index, + query, + n_query_pts, + k, + R_knn_inds.data(), + R_knn_dists.data(), + dfunc, + inds, + dists, + dists_counter.data(), + post_dists_counter.data(), + weight, perform_post_filtering); } diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh index c6cb679408..181dad1a90 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh @@ -27,7 +27,8 @@ namespace detail { struct NNComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's reference landmark, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -39,17 +40,20 @@ struct NNComp { struct HaversineFunc { template - __device__ __host__ __forceinline__ value_t - operator()(const value_t *a, const value_t *b, const value_int n_dims) { - return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1], - b[1]); + __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) + { + return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1], b[1]); } }; struct EuclideanFunc { template - __device__ __host__ __forceinline__ value_t - operator()(const value_t *a, const value_t *b, const value_int n_dims) { + __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) + { value_t sum_sq = 0; for (value_int i = 0; i < n_dims; ++i) { value_t diff = a[i] - b[i]; @@ -63,7 +67,8 @@ struct EuclideanFunc { /** * Zeros the bit at location h in a one-hot encoded 32-bit int array */ -__device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) { +__device__ inline void _zero_bit(std::uint32_t* arr, std::uint32_t h) +{ int bit = h % 32; int idx = h / 32; @@ -71,7 +76,7 @@ __device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) { std::uint32_t old = arr[idx]; do { assumed = old; - old = atomicCAS(arr + idx, assumed, assumed & ~(1 << bit)); + old = atomicCAS(arr + idx, assumed, assumed & ~(1 << bit)); } while (assumed != old); } @@ -79,7 +84,8 @@ __device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) { * Returns whether or not bit at location h is nonzero in a one-hot * encoded 32-bit in array. */ -__device__ inline bool _get_val(std::uint32_t *arr, std::uint32_t h) { +__device__ inline bool _get_val(std::uint32_t* arr, std::uint32_t h) +{ int bit = h % 32; int idx = h / 32; return (arr[idx] & (1 << bit)) > 0; diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh index 4a476274dd..5d28258f7a 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh @@ -58,14 +58,24 @@ namespace detail { * @param output * @param weight */ -template -__global__ void perform_post_filter_registers( - const value_t *X, value_int n_cols, const value_idx *R_knn_inds, - const value_t *R_knn_dists, const value_t *R_radius, const value_t *landmarks, - int n_landmarks, value_int bitset_size, value_int k, distance_func dfunc, - std::uint32_t *output, float weight = 1.0) { +__global__ void perform_post_filter_registers(const value_t* X, + value_int n_cols, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + const value_t* R_radius, + const value_t* landmarks, + int n_landmarks, + value_int bitset_size, + value_int k, + distance_func dfunc, + std::uint32_t* output, + float weight = 1.0) +{ // allocate array of size n_landmarks / 32 ints extern __shared__ std::uint32_t shared_mem[]; @@ -98,8 +108,7 @@ __global__ void perform_post_filter_registers( for (value_int l = threadIdx.x; l < n_landmarks; l += tpb) { // compute p(q, r) value_t dist = dfunc(local_x_ptr, landmarks + (n_cols * l), n_cols); - if (dist > weight * (closest_R_dist + R_radius[l]) || - dist > 3 * closest_R_dist) { + if (dist > weight * (closest_R_dist + R_radius[l]) || dist > 3 * closest_R_dist) { _zero_bit(shared_mem, l); } } @@ -136,38 +145,58 @@ __global__ void perform_post_filter_registers( * @param k * @param dist_counter */ -template -__global__ void compute_final_dists_registers( - const value_t *X_index, const value_t *X, const value_int n_cols, - bitset_type *bitset, value_int bitset_size, const value_t *R_knn_dists, - const value_idx *R_indptr, const value_idx *R_1nn_inds, - const value_t *R_1nn_dists, value_idx *knn_inds, value_t *knn_dists, - value_int n_landmarks, value_int k, dist_func dfunc, - value_int *dist_counter) { +template +__global__ void compute_final_dists_registers(const value_t* X_index, + const value_t* X, + const value_int n_cols, + bitset_type* bitset, + value_int bitset_size, + const value_t* R_knn_dists, + const value_idx* R_indptr, + const value_idx* R_1nn_inds, + const value_t* R_1nn_dists, + value_idx* knn_inds, + value_t* knn_dists, + value_int n_landmarks, + value_int k, + dist_func dfunc, + value_int* dist_counter) +{ static constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; - __shared__ faiss::gpu::KeyValuePair - shared_memV[kNumWarps * warp_q]; + __shared__ faiss::gpu::KeyValuePair shared_memV[kNumWarps * warp_q]; - const value_t *x_ptr = X + (n_cols * blockIdx.x); + const value_t* x_ptr = X + (n_cols * blockIdx.x); value_t local_x_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_x_ptr[j] = x_ptr[j]; } - faiss::gpu::KeyValueBlockSelect, warp_q, - thread_q, tpb> + faiss::gpu::KeyValueBlockSelect, + warp_q, + thread_q, + tpb> heap(faiss::gpu::Limits::getMax(), - faiss::gpu::Limits::getMax(), -1, shared_memK, shared_memV, + faiss::gpu::Limits::getMax(), + -1, + shared_memK, + shared_memV, k); const value_int n_k = faiss::gpu::utils::roundDown(k, faiss::gpu::kWarpSize); - value_int i = threadIdx.x; + value_int i = threadIdx.x; for (; i < n_k; i += tpb) { value_idx ind = knn_inds[blockIdx.x * k + i]; heap.add(knn_dists[blockIdx.x * k + i], R_knn_dists[ind * k], ind); @@ -185,33 +214,31 @@ __global__ void compute_final_dists_registers( // candidate if (_get_val(bitset + (blockIdx.x * bitset_size), cur_R_ind)) { value_idx R_start_offset = R_indptr[cur_R_ind]; - value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; - value_idx R_size = R_stop_offset - R_start_offset; + value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; + value_idx R_size = R_stop_offset - R_start_offset; // Loop through R's neighborhood in parallel // Round R_size to the nearest warp threads so they can // all be computing in parallel. - const value_int limit = - faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); + const value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); i = threadIdx.x; for (; i < limit; i += tpb) { value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; - value_t z = heap.warpKTopRDist == 0.00 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; - z = isnan(z) ? 0.0 : z; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + z = isnan(z) ? 0.0 : z; // If lower bound on distance could possibly be in // the closest k neighbors, compute it and add to k-select value_t dist = std::numeric_limits::max(); if (z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -226,21 +253,20 @@ __global__ void compute_final_dists_registers( // second round guarantees to be only a single warp. if (i < R_size) { value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; - value_t z = heap.warpKTopRDist == 0.00 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; z = isnan(z) ? 0.0 : z; // If lower bound on distance could possibly be in // the closest k neighbors, compute it and add to k-select value_t dist = std::numeric_limits::max(); if (z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -257,7 +283,7 @@ __global__ void compute_final_dists_registers( for (value_int i = threadIdx.x; i < k; i += tpb) { knn_dists[blockIdx.x * k + i] = shared_memK[i]; - knn_inds[blockIdx.x * k + i] = shared_memV[i].value; + knn_inds[blockIdx.x * k + i] = shared_memV[i].value; } } @@ -278,28 +304,41 @@ __global__ void compute_final_dists_registers( * @param R_1nn_cols * @param R_1nn_dists */ -template -__global__ void block_rbc_kernel_registers( - const value_t *X_index, const value_t *X, - value_int n_cols, // n_cols should be 2 or 3 dims - const value_idx *R_knn_inds, const value_t *R_knn_dists, value_int m, - value_int k, const value_idx *R_indptr, const value_idx *R_1nn_cols, - const value_t *R_1nn_dists, value_idx *out_inds, value_t *out_dists, - value_int *dist_counter, value_t *R_radius, distance_func dfunc, - float weight = 1.0) { +template +__global__ void block_rbc_kernel_registers(const value_t* X_index, + const value_t* X, + value_int n_cols, // n_cols should be 2 or 3 dims + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + value_int m, + value_int k, + const value_idx* R_indptr, + const value_idx* R_1nn_cols, + const value_t* R_1nn_dists, + value_idx* out_inds, + value_t* out_dists, + value_int* dist_counter, + value_t* R_radius, + distance_func dfunc, + float weight = 1.0) +{ static constexpr value_int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; - __shared__ faiss::gpu::KeyValuePair - shared_memV[kNumWarps * warp_q]; + __shared__ faiss::gpu::KeyValuePair shared_memV[kNumWarps * warp_q]; // TODO: Separate kernels for different widths: // 1. Very small (between 3 and 32) just use registers for columns of "blockIdx.x" // 2. Can fit comfortably in shared memory (32 to a few thousand?) // 3. Load each time individually. - const value_t *x_ptr = X + (n_cols * blockIdx.x); + const value_t* x_ptr = X + (n_cols * blockIdx.x); // Use registers only for 2d or 3d value_t local_x_ptr[col_q]; @@ -308,11 +347,18 @@ __global__ void block_rbc_kernel_registers( } // Each warp works on 1 R - faiss::gpu::KeyValueBlockSelect, warp_q, - thread_q, tpb> + faiss::gpu::KeyValueBlockSelect, + warp_q, + thread_q, + tpb> heap(faiss::gpu::Limits::getMax(), - faiss::gpu::Limits::getMax(), -1, shared_memK, shared_memV, + faiss::gpu::Limits::getMax(), + -1, + shared_memK, + shared_memV, k); value_t min_R_dist = R_knn_dists[blockIdx.x * k + (k - 1)]; @@ -327,7 +373,7 @@ __global__ void block_rbc_kernel_registers( // determining if the distance could even potentially be in the heap. for (value_int cur_k = 0; cur_k < k; ++cur_k) { // index and distance to current blockIdx.x's closest landmark - value_t cur_R_dist = R_knn_dists[blockIdx.x * k + cur_k]; + value_t cur_R_dist = R_knn_dists[blockIdx.x * k + cur_k]; value_idx cur_R_ind = R_knn_inds[blockIdx.x * k + cur_k]; // Equation (2) in Cayton's paper- prune out R's which are > 3 * p(q, r_q) @@ -336,38 +382,37 @@ __global__ void block_rbc_kernel_registers( // The whole warp should iterate through the elements in the current R value_idx R_start_offset = R_indptr[cur_R_ind]; - value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; + value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; value_idx R_size = R_stop_offset - R_start_offset; - value_int limit = - faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); - value_int i = threadIdx.x; + value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); + value_int i = threadIdx.x; for (; i < limit; i += tpb) { // Index and distance of current candidate's nearest landmark value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; // Take 2 landmarks l_1 and l_2 where l_1 is the furthest point in the heap // and l_2 is the current landmark R. s is the current data point and // t is the new candidate data point. We know that: - // d(s, t) cannot possibly be any smaller than | d(s, l_1) - d(l_1, l_2) | * | d(l_1, l_2) - d(l_2, t) | - d(s, l_1) * d(l_2, t) + // d(s, t) cannot possibly be any smaller than | d(s, l_1) - d(l_1, l_2) | * | d(l_1, l_2) - + // d(l_2, t) | - d(s, l_1) * d(l_2, t) - // Therefore, if d(s, t) >= d(s, l_1) from the computation above, we know that the distance to the candidate point - // cannot possibly be in the nearest neighbors. However, if d(s, t) < d(s, l_1) then we should compute the - // distance because it's possible it could be smaller. + // Therefore, if d(s, t) >= d(s, l_1) from the computation above, we know that the distance to + // the candidate point cannot possibly be in the nearest neighbors. However, if d(s, t) < d(s, + // l_1) then we should compute the distance because it's possible it could be smaller. // - value_t z = heap.warpKTopRDist == 0.00 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; - - z = isnan(z) ? 0.0 : z; + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + + z = isnan(z) ? 0.0 : z; value_t dist = std::numeric_limits::max(); if (i < k || z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -381,18 +426,17 @@ __global__ void block_rbc_kernel_registers( if (i < R_size) { value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; - value_t z = heap.warpKTopRDist == 0.0 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; - - z = isnan(z) ? 0.0 : z; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t z = heap.warpKTopRDist == 0.0 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + + z = isnan(z) ? 0.0 : z; value_t dist = std::numeric_limits::max(); if (i < k || z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -411,124 +455,327 @@ __global__ void block_rbc_kernel_registers( for (int i = threadIdx.x; i < k; i += tpb) { out_dists[blockIdx.x * k + i] = shared_memK[i]; - out_inds[blockIdx.x * k + i] = shared_memV[i].value; + out_inds[blockIdx.x * k + i] = shared_memV[i].value; } } -template -void rbc_low_dim_pass_one(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query, const value_int n_query_rows, - value_int k, const value_idx *R_knn_inds, - const value_t *R_knn_dists, dist_func dfunc, - value_idx *inds, value_t *dists, float weight, - value_int *dists_counter) { +template +void rbc_low_dim_pass_one(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* dists_counter) +{ if (k <= 32) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 64) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 128) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 256) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 512) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 1024) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); } -template -void rbc_low_dim_pass_two(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query, const value_int n_query_rows, - value_int k, const value_idx *R_knn_inds, - const value_t *R_knn_dists, dist_func dfunc, - value_idx *inds, value_t *dists, float weight, - value_int *post_dists_counter) { +template +void rbc_low_dim_pass_two(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* post_dists_counter) +{ const value_int bitset_size = ceil(index.n_landmarks / 32.0); - rmm::device_uvector bitset(bitset_size * index.m, - handle.get_stream()); + rmm::device_uvector bitset(bitset_size * index.m, handle.get_stream()); perform_post_filter_registers - <<>>(index.get_X(), index.n, R_knn_inds, R_knn_dists, - index.get_R_radius(), index.get_R(), - index.n_landmarks, bitset_size, k, dfunc, - bitset.data(), weight); + <<>>( + index.get_X(), + index.n, + R_knn_inds, + R_knn_dists, + index.get_R_radius(), + index.get_R(), + index.n_landmarks, + bitset_size, + k, + dfunc, + bitset.data(), + weight); if (k <= 32) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 64) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 128) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 256) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 512) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 1024) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); } }; // namespace detail diff --git a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh index d2f7bc2210..a53a5b03e6 100644 --- a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh @@ -25,15 +25,19 @@ namespace gpu { // `Dir` true, produce largest values. // `Dir` false, produce smallest values. -template +template struct KeyValueBlockSelect { - static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; static constexpr int kTotalWarpSortSize = NumWarpQ; - __device__ inline KeyValueBlockSelect(K initKVal, K initVKey, V initVVal, - K* smemK, KeyValuePair* smemV, - int k) + __device__ inline KeyValueBlockSelect( + K initKVal, K initVKey, V initVVal, K* smemK, KeyValuePair* smemV, int k) : initK(initKVal), initVk(initVKey), initVv(initVVal), @@ -42,53 +46,55 @@ struct KeyValueBlockSelect { warpKTopRDist(initKVal), sharedK(smemK), sharedV(smemV), - kMinus1(k - 1) { - static_assert(utils::isPowerOf2(ThreadsPerBlock), - "threads must be a power-of-2"); + kMinus1(k - 1) + { + static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); // Fill the per-thread queue keys with the default value #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initVk; + threadK[i] = initK; + threadV[i].key = initVk; threadV[i].value = initVv; } int laneId = getLaneId(); int warpId = threadIdx.x / kWarpSize; - warpK = sharedK + warpId * kTotalWarpSortSize; - warpV = sharedV + warpId * kTotalWarpSortSize; + warpK = sharedK + warpId * kTotalWarpSortSize; + warpV = sharedV + warpId * kTotalWarpSortSize; // Fill warp queue (only the actual queue space is fine, not where // we write the per-thread queues for merging) for (int i = laneId; i < NumWarpQ; i += kWarpSize) { - warpK[i] = initK; - warpV[i].key = initVk; + warpK[i] = initK; + warpV[i].key = initVk; warpV[i].value = initVv; } warpFence(); } - __device__ inline void addThreadQ(K k, K vk, V vv) { + __device__ inline void addThreadQ(K k, K vk, V vv) + { if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { // Rotate right #pragma unroll for (int i = NumThreadQ - 1; i > 0; --i) { - threadK[i] = threadK[i - 1]; - threadV[i].key = threadV[i - 1].key; + threadK[i] = threadK[i - 1]; + threadV[i].key = threadV[i - 1].key; threadV[i].value = threadV[i - 1].value; } - threadK[0] = k; - threadV[0].key = vk; + threadK[0] = k; + threadV[0].key = vk; threadV[0].value = vv; ++numVals; } } - __device__ inline void checkThreadQ() { + __device__ inline void checkThreadQ() + { bool needSort = (numVals == NumThreadQ); #if CUDA_VERSION >= 9000 @@ -111,13 +117,13 @@ struct KeyValueBlockSelect { #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initVk; + threadK[i] = initK; + threadV[i].key = initVk; threadV[i].value = initVv; } // We have to beat at least this element - warpKTop = warpK[kMinus1]; + warpKTop = warpK[kMinus1]; warpKTopRDist = warpV[kMinus1].key; warpFence(); @@ -126,7 +132,8 @@ struct KeyValueBlockSelect { /// This function handles sorting and merging together the /// per-thread queues with the warp-wide queue, creating a sorted /// list across both - __device__ inline void mergeWarpQ() { + __device__ inline void mergeWarpQ() + { int laneId = getLaneId(); // Sort all of the per-thread queues @@ -138,8 +145,8 @@ struct KeyValueBlockSelect { #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpKRegisters[i] = warpK[i * kWarpSize + laneId]; - warpVRegisters[i].key = warpV[i * kWarpSize + laneId].key; + warpKRegisters[i] = warpK[i * kWarpSize + laneId]; + warpVRegisters[i].key = warpV[i * kWarpSize + laneId].key; warpVRegisters[i].value = warpV[i * kWarpSize + laneId].value; } @@ -148,15 +155,14 @@ struct KeyValueBlockSelect { // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing // one sorted list - warpMergeAnyRegistersKVP(warpKRegisters, warpVRegisters, threadK, - threadV); + warpMergeAnyRegistersKVP( + warpKRegisters, warpVRegisters, threadK, threadV); // Write back out the warp queue #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i * kWarpSize + laneId] = warpKRegisters[i]; - warpV[i * kWarpSize + laneId].key = warpVRegisters[i].key; + warpK[i * kWarpSize + laneId] = warpKRegisters[i]; + warpV[i * kWarpSize + laneId].key = warpVRegisters[i].key; warpV[i * kWarpSize + laneId].value = warpVRegisters[i].value; } @@ -165,12 +171,14 @@ struct KeyValueBlockSelect { /// WARNING: all threads in a warp must participate in this. /// Otherwise, you must call the constituent parts separately. - __device__ inline void add(K k, K vk, V vv) { + __device__ inline void add(K k, K vk, V vv) + { addThreadQ(k, vk, vv); checkThreadQ(); } - __device__ inline void reduce() { + __device__ inline void reduce() + { // Have all warps dump and merge their queues; this will produce // the final per-warp results mergeWarpQ(); @@ -182,8 +190,8 @@ struct KeyValueBlockSelect { // All warp queues are contiguous in smem. // Now, we have kNumWarps lists of NumWarpQ elements. // This is a power of 2. - FinalBlockMerge, NumWarpQ, - Dir, Comp>::merge(sharedK, sharedV); + FinalBlockMerge, NumWarpQ, Dir, Comp>::merge( + sharedK, sharedV); // The block-wide merge has a trailing syncthreads } diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 0c0398a336..5618186dfc 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -27,37 +27,26 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::MetricType build_faiss_metric( - raft::distance::DistanceType metric) { +inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric) +{ switch (metric) { case raft::distance::DistanceType::CosineExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; case raft::distance::DistanceType::CorrelationExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::L2Expanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2Unexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtExpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtUnexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L1: - return faiss::MetricType::METRIC_L1; - case raft::distance::DistanceType::InnerProduct: - return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::LpUnexpanded: - return faiss::MetricType::METRIC_Lp; - case raft::distance::DistanceType::Linf: - return faiss::MetricType::METRIC_Linf; - case raft::distance::DistanceType::Canberra: - return faiss::MetricType::METRIC_Canberra; - case raft::distance::DistanceType::BrayCurtis: - return faiss::MetricType::METRIC_BrayCurtis; + case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1; + case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT; + case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp; + case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf; + case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra; + case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis; case raft::distance::DistanceType::JensenShannon: return faiss::MetricType::METRIC_JensenShannon; - default: - THROW("MetricType not supported: %d", metric); + default: THROW("MetricType not supported: %d", metric); } } diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index f774d9d1ea..47fc62066d 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -29,19 +29,21 @@ namespace knn { namespace detail { template -DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m, - const unsigned int numOfNN) { +DI void loadAllWarpQShmem(myWarpSelect& heapArr, + Pair* shDumpKV, + const IdxT m, + const unsigned int numOfNN) +{ const int lid = raft::laneId(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (rowId < m) { #pragma unroll for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { const int idx = j * warpSize + lid; if (idx < numOfNN) { - Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; heapArr[i]->warpV[j] = KVPair.key; heapArr[i]->warpK[j] = KVPair.value; } @@ -51,14 +53,17 @@ DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m, } template -DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId, - const unsigned int numOfNN) { +DI void loadWarpQShmem(myWarpSelect& heapArr, + Pair* shDumpKV, + const int rowId, + const unsigned int numOfNN) +{ const int lid = raft::laneId(); #pragma unroll for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { const int idx = j * warpSize + lid; if (idx < numOfNN) { - Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; heapArr->warpV[j] = KVPair.key; heapArr->warpK[j] = KVPair.value; } @@ -66,25 +71,31 @@ DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId, } template -DI void storeWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT rowId, - const unsigned int numOfNN) { +DI void storeWarpQShmem(myWarpSelect& heapArr, + Pair* shDumpKV, + const IdxT rowId, + const unsigned int numOfNN) +{ const int lid = raft::laneId(); #pragma unroll for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { const int idx = j * warpSize + lid; if (idx < numOfNN) { - Pair otherKV = Pair(heapArr->warpV[j], heapArr->warpK[j]); + Pair otherKV = Pair(heapArr->warpV[j], heapArr->warpK[j]); shDumpKV[rowId * numOfNN + idx] = otherKV; } } } -template -DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds, - const IdxT m, const unsigned int numOfNN, - const IdxT starty) { +template +DI void storeWarpQGmem(myWarpSelect& heapArr, + OutT* out_dists, + IdxT* out_inds, + const IdxT m, + const unsigned int numOfNN, + const IdxT starty) +{ const int lid = raft::laneId(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { @@ -95,18 +106,21 @@ DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds, const auto idx = j * warpSize + lid; if (idx < numOfNN) { out_dists[gmemRowId * numOfNN + idx] = heapArr[i]->warpK[j]; - out_inds[gmemRowId * numOfNN + idx] = (IdxT)heapArr[i]->warpV[j]; + out_inds[gmemRowId * numOfNN + idx] = (IdxT)heapArr[i]->warpV[j]; } } } } } -template -DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists, - IdxT *out_inds, const IdxT m, - const unsigned int numOfNN, const IdxT starty) { +template +DI void loadPrevTopKsGmemWarpQ(myWarpSelect& heapArr, + OutT* out_dists, + IdxT* out_inds, + const IdxT m, + const unsigned int numOfNN, + const IdxT starty) +{ const int lid = raft::laneId(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { @@ -121,17 +135,17 @@ DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists, } } auto constexpr kLaneWarpKTop = heapArr[i]->kNumWarpQRegisters - 1; - heapArr[i]->warpKTop = - raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane); + heapArr[i]->warpKTop = raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane); } } } template -DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, - int finalNumVals, int startId = 0) { +DI void updateSortedWarpQ( + myWarpSelect& heapArr, Pair* allWarpTopKs, int rowId, int finalNumVals, int startId = 0) +{ constexpr uint32_t mask = 0xffffffffu; - const int lid = raft::laneId(); + const int lid = raft::laneId(); // calculate srcLane such that tid 0 -> 31, 1 -> 0,... 31 -> 30. // warp around 0 to 31 required for NN > 32 const auto srcLane = (warpSize + (lid - 1)) & (warpSize - 1); @@ -140,12 +154,11 @@ DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, Pair KVPair = allWarpTopKs[rowId * (256) + k]; #pragma unroll for (int i = 0; i < NumWarpQRegs; i++) { - unsigned activeLanes = - __ballot_sync(mask, KVPair.value < heapArr->warpK[i]); + unsigned activeLanes = __ballot_sync(mask, KVPair.value < heapArr->warpK[i]); if (activeLanes) { Pair tempKV; - tempKV.value = raft::shfl(heapArr->warpK[i], srcLane); - tempKV.key = raft::shfl(heapArr->warpV[i], srcLane); + tempKV.value = raft::shfl(heapArr->warpK[i], srcLane); + tempKV.key = raft::shfl(heapArr->warpV[i], srcLane); const auto firstActiveLane = __ffs(activeLanes) - 1; if (firstActiveLane == lid) { heapArr->warpK[i] = KVPair.value; @@ -168,43 +181,60 @@ DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, } } -template -__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( - const DataT *x, const DataT *y, const DataT *_xn, const DataT *_yn, - const IdxT m, const IdxT n, const IdxT k, const IdxT lda, const IdxT ldb, - const IdxT ldd, CoreLambda core_op, FinalLambda fin_op, bool sqrt, - unsigned int numOfNN, int *mutexes, OutT *out_dists, IdxT *out_inds) { +template +__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + const IdxT m, + const IdxT n, + const IdxT k, + const IdxT lda, + const IdxT ldb, + const IdxT ldd, + CoreLambda core_op, + FinalLambda fin_op, + bool sqrt, + unsigned int numOfNN, + int* mutexes, + OutT* out_dists, + IdxT* out_inds) +{ extern __shared__ char smem[]; typedef cub::KeyValuePair Pair; constexpr auto identity = std::numeric_limits::max(); - constexpr auto keyMax = std::numeric_limits::max(); - constexpr auto Dir = false; - typedef faiss::gpu::WarpSelect< - AccT, uint32_t, Dir, faiss::gpu::Comparator, NumWarpQ, NumThreadQ, 32> - myWarpSelect; - - auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, - mutexes] __device__(IdxT gridStrideY) { - if (gridDim.x == 1) { - return; - } + constexpr auto keyMax = std::numeric_limits::max(); + constexpr auto Dir = false; + typedef faiss::gpu:: + WarpSelect, NumWarpQ, NumThreadQ, 32> + myWarpSelect; - volatile int *mutex = mutexes; + auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, mutexes] __device__( + IdxT gridStrideY) { + if (gridDim.x == 1) { return; } - Pair *shDumpKV = nullptr; + volatile int* mutex = mutexes; + + Pair* shDumpKV = nullptr; if (useNorms) { - shDumpKV = - (Pair *)(&smem[Policy::SmemSize + - ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]); + shDumpKV = (Pair*)(&smem[Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]); } else { - shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + shDumpKV = (Pair*)(&smem[Policy::SmemSize]); } - const int lid = threadIdx.x % warpSize; + const int lid = threadIdx.x % warpSize; const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); // 0 -> consumer done consuming the buffer. @@ -215,7 +245,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( auto cta_processed = 0; myWarpSelect heapArr1(identity, keyMax, numOfNN); myWarpSelect heapArr2(identity, keyMax, numOfNN); - myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; __syncwarp(); loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); @@ -224,7 +254,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( if (threadIdx.x == 0) { int32_t old = -3; while (old != -1) { - old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], -2, -1); + old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], -2, -1); } __threadfence(); } @@ -232,18 +262,17 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; #pragma unroll for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { Pair otherKV; - otherKV.value = identity; - otherKV.key = keyMax; + otherKV.value = identity; + otherKV.key = keyMax; const auto idx = j * warpSize + lid; if (idx < numOfNN && rowId < m) { - otherKV.value = out_dists[rowId * numOfNN + idx]; - otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; + otherKV.value = out_dists[rowId * numOfNN + idx]; + otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; shDumpKV[shMemRowId * numOfNN + idx] = otherKV; } } @@ -260,19 +289,16 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (rowId < m) { #pragma unroll for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { Pair otherKV; - otherKV.value = identity; - otherKV.key = keyMax; + otherKV.value = identity; + otherKV.key = keyMax; const auto idx = j * warpSize + lid; - if (idx < numOfNN) { - otherKV = shDumpKV[shMemRowId * numOfNN + idx]; - } + if (idx < numOfNN) { otherKV = shDumpKV[shMemRowId * numOfNN + idx]; } heapArr[i]->add(otherKV.value, otherKV.key); } } @@ -284,20 +310,17 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( const auto rowId = starty + i * Policy::AccThRows; if (rowId < m) { bool needSort = (heapArr[i]->numVals > 0); - needSort = __any_sync(0xffffffff, needSort); - if (needSort) { - heapArr[i]->reduce(); - } + needSort = __any_sync(0xffffffff, needSort); + if (needSort) { heapArr[i]->reduce(); } } } - storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, - starty); + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); } else { if (threadIdx.x == 0) { - int32_t old = -1; + int32_t old = -1; int32_t blkIdX = (int32_t)blockIdx.x; while (old != blkIdX) { - old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX); + old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX); } __threadfence(); } @@ -305,14 +328,13 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (rowId < m) { for (int idx = lid; idx < numOfNN; idx += warpSize) { - Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; + Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; out_dists[rowId * numOfNN + idx] = KVPair.value; - out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; + out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; } } } @@ -328,7 +350,9 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( // epilogue operation lambda for final value calculation auto epilog_lambda = [numOfNN, m, n, ldd, out_dists, out_inds] __device__( AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { if (useNorms) { #pragma unroll @@ -340,36 +364,34 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } } - Pair *shDumpKV = nullptr; + Pair* shDumpKV = nullptr; if (useNorms) { constexpr size_t shmemSize = Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); - shDumpKV = (Pair *)(&smem[shmemSize]); + shDumpKV = (Pair*)(&smem[shmemSize]); } else { - shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + shDumpKV = (Pair*)(&smem[Policy::SmemSize]); } constexpr uint32_t mask = 0xffffffffu; - const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); - const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); - const int lid = raft::laneId(); + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); + const int lid = raft::laneId(); myWarpSelect heapArr1(identity, keyMax, numOfNN); myWarpSelect heapArr2(identity, keyMax, numOfNN); - myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; if (usePrevTopKs) { if (gridStrideX == blockIdx.x * Policy::Nblk) { - loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, - numOfNN, starty); + loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, numOfNN, starty); } } if (gridStrideX > blockIdx.x * Policy::Nblk) { #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1]; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1]; heapArr[i]->warpKTop = tempKV.value; } @@ -378,16 +400,14 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( int anyWarpTopKs = 0; #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; numValsWarpTopK[i] = 0; if (rowId < m) { #pragma unroll for (int j = 0; j < Policy::AccColsPerTh; ++j) { const auto colId = startx + j * Policy::AccThCols; if (colId < ldd) { - if (acc[i][j] < heapArr[i]->warpKTop) { - numValsWarpTopK[i]++; - } + if (acc[i][j] < heapArr[i]->warpKTop) { numValsWarpTopK[i]++; } } } anyWarpTopKs += numValsWarpTopK[i]; @@ -395,24 +415,21 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } anyWarpTopKs = __syncthreads_or(anyWarpTopKs > 0); if (anyWarpTopKs) { - Pair *allWarpTopKs = (Pair *)(&smem[0]); + Pair* allWarpTopKs = (Pair*)(&smem[0]); uint32_t needScanSort[Policy::AccRowsPerTh]; #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { const auto gmemRowId = starty + i * Policy::AccThRows; - needScanSort[i] = 0; + needScanSort[i] = 0; if (gmemRowId < m) { - int myVals = numValsWarpTopK[i]; + int myVals = numValsWarpTopK[i]; needScanSort[i] = __ballot_sync(mask, myVals > 0); if (needScanSort[i]) { #pragma unroll for (unsigned int k = 1; k <= 16; k *= 2) { - const unsigned int n = - __shfl_up_sync(mask, numValsWarpTopK[i], k); - if (lid >= k) { - numValsWarpTopK[i] += n; - } + const unsigned int n = __shfl_up_sync(mask, numValsWarpTopK[i], k); + if (lid >= k) { numValsWarpTopK[i] += n; } } } // As each thread will know its total vals to write. @@ -421,8 +438,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } if (needScanSort[i]) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (gmemRowId < m) { if (needScanSort[i] & ((uint32_t)1 << lid)) { #pragma unroll @@ -430,17 +446,15 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( const auto colId = startx + j * Policy::AccThCols; if (colId < ldd) { if (acc[i][j] < heapArr[i]->warpKTop) { - Pair otherKV = {colId, acc[i][j]}; - allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = - otherKV; + Pair otherKV = {colId, acc[i][j]}; + allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = otherKV; numValsWarpTopK[i]++; } } } } const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31); - loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, - numOfNN); + loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, numOfNN); updateSortedWarpQkNumWarpQRegisters>( heapArr[i], &allWarpTopKs[0], rowId, finalNumVals); } @@ -450,12 +464,10 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { if (needScanSort[i]) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; const auto gmemRowId = starty + i * Policy::AccThRows; if (gmemRowId < m) { - storeWarpQShmem(heapArr[i], shDumpKV, rowId, - numOfNN); + storeWarpQShmem(heapArr[i], shDumpKV, rowId, numOfNN); } } } @@ -463,28 +475,24 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } else { #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto gmemRowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto gmemRowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (gmemRowId < m) { #pragma unroll for (int j = 0; j < Policy::AccColsPerTh; ++j) { const auto colId = startx + j * Policy::AccThCols; - Pair otherKV = {keyMax, identity}; + Pair otherKV = {keyMax, identity}; if (colId < ldd) { otherKV.value = acc[i][j]; - otherKV.key = colId; + otherKV.key = colId; } heapArr[i]->add(otherKV.value, otherKV.key); } bool needSort = (heapArr[i]->numVals > 0); - needSort = __any_sync(mask, needSort); - if (needSort) { - heapArr[i]->reduce(); - } - storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, - numOfNN); + needSort = __any_sync(mask, needSort); + if (needSort) { heapArr[i]->reduce(); } + storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, numOfNN); } } } @@ -492,27 +500,64 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( if (((gridStrideX + Policy::Nblk * gridDim.x) > n) && gridDim.x == 1) { // This is last iteration of grid stride X loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); - storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, - starty); + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); } }; - raft::distance::detail::PairwiseDistances< - useNorms, DataT, AccT, OutT, IdxT, Policy, CoreLambda, - decltype(epilog_lambda), FinalLambda, decltype(rowEpilog_lambda), - isRowMajor, false> - obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, nullptr, smem, core_op, - epilog_lambda, fin_op, rowEpilog_lambda); + raft::distance::detail::PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + _xn, + _yn, + nullptr, + smem, + core_op, + epilog_lambda, + fin_op, + rowEpilog_lambda); obj.run(); } -template -void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, - OutT *out_dists, IdxT *out_inds, IdxT numOfNN, - cudaStream_t stream, void *workspace, - size_t &worksize) { +template +void fusedL2UnexpKnnImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ typedef typename raft::linalg::Policy2x8::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; @@ -532,12 +577,30 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef cub::KeyValuePair Pair; if (isRowMajor) { - constexpr auto fusedL2UnexpKnn32RowMajor = - fusedL2kNN; - constexpr auto fusedL2UnexpKnn64RowMajor = - fusedL2kNN; + constexpr auto fusedL2UnexpKnn32RowMajor = fusedL2kNN; + constexpr auto fusedL2UnexpKnn64RowMajor = fusedL2kNN; auto fusedL2UnexpKnnRowMajor = fusedL2UnexpKnn32RowMajor; if (numOfNN <= 32) { @@ -545,13 +608,11 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, } else if (numOfNN <= 64) { fusedL2UnexpKnnRowMajor = fusedL2UnexpKnn64RowMajor; } else { - ASSERT(numOfNN <= 64, - "fusedL2kNN: num of nearest neighbors must be <= 64"); + ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64"); } - const auto sharedMemSize = - KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair)); - dim3 grid = raft::distance::detail::launchConfigGenerator( + const auto sharedMemSize = KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair)); + dim3 grid = raft::distance::detail::launchConfigGenerator( m, n, sharedMemSize, fusedL2UnexpKnnRowMajor); if (grid.x > 1) { @@ -560,51 +621,133 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, worksize = sizeof(int32_t) * numMutexes; return; } else { - CUDA_CHECK( - cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); + CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); } } - fusedL2UnexpKnnRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt, - (uint32_t)numOfNN, (int *)workspace, out_dists, out_inds); + fusedL2UnexpKnnRowMajor<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + core_lambda, + fin_op, + sqrt, + (uint32_t)numOfNN, + (int*)workspace, + out_dists, + out_inds); } else { } CUDA_CHECK(cudaGetLastError()); } -template -void fusedL2UnexpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *out_dists, - IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, - void *workspace, size_t &worksize) { +template +void fusedL2UnexpKnn(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - fusedL2UnexpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2UnexpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - fusedL2UnexpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2UnexpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else { - fusedL2UnexpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2UnexpKnnImpl(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } } -template -void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *out_dists, - IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, - void *workspace, size_t &worksize) { +template +void fusedL2ExpKnnImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ typedef typename raft::linalg::Policy2x8::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; @@ -612,28 +755,43 @@ void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, ASSERT(isRowMajor, "Only Row major inputs are allowed"); - ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || - (worksize < m * sizeof(AccT))), + ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; auto fin_op = [] __device__(AccT d_val, int g_d_idx) { return d_val; }; typedef cub::KeyValuePair Pair; if (isRowMajor) { - constexpr auto fusedL2ExpKnn32RowMajor = - fusedL2kNN; - constexpr auto fusedL2ExpKnn64RowMajor = - fusedL2kNN; + constexpr auto fusedL2ExpKnn32RowMajor = fusedL2kNN; + constexpr auto fusedL2ExpKnn64RowMajor = fusedL2kNN; auto fusedL2ExpKnnRowMajor = fusedL2ExpKnn32RowMajor; if (numOfNN <= 32) { @@ -641,77 +799,137 @@ void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, } else if (numOfNN <= 64) { fusedL2ExpKnnRowMajor = fusedL2ExpKnn64RowMajor; } else { - ASSERT(numOfNN <= 64, - "fusedL2kNN: num of nearest neighbors must be <= 64"); + ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64"); } - const auto sharedMemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)) + - (KPolicy::Mblk * numOfNN * sizeof(Pair)); + const auto sharedMemSize = KPolicy::SmemSize + + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)) + + (KPolicy::Mblk * numOfNN * sizeof(Pair)); dim3 grid = raft::distance::detail::launchConfigGenerator( m, n, sharedMemSize, fusedL2ExpKnnRowMajor); - int32_t *mutexes = nullptr; + int32_t* mutexes = nullptr; if (grid.x > 1) { - const auto numMutexes = raft::ceildiv(m, KPolicy::Mblk); - const auto normsSize = - (x != y) ? (m + n) * sizeof(DataT) : n * sizeof(DataT); + const auto numMutexes = raft::ceildiv(m, KPolicy::Mblk); + const auto normsSize = (x != y) ? (m + n) * sizeof(DataT) : n * sizeof(DataT); const auto requiredSize = sizeof(int32_t) * numMutexes + normsSize; if (worksize < requiredSize) { worksize = requiredSize; return; } else { - mutexes = (int32_t *)((char *)workspace + normsSize); - CUDA_CHECK( - cudaMemsetAsync(mutexes, 0, sizeof(int32_t) * numMutexes, stream)); + mutexes = (int32_t*)((char*)workspace + normsSize); + CUDA_CHECK(cudaMemsetAsync(mutexes, 0, sizeof(int32_t) * numMutexes, stream)); } } - DataT *xn = (DataT *)workspace; - DataT *yn = (DataT *)workspace; + DataT* xn = (DataT*)workspace; + DataT* yn = (DataT*)workspace; auto norm_op = [] __device__(DataT in) { return in; }; if (x != y) { yn += m; - raft::linalg::rowNorm(xn, x, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(xn, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(xn, x, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(xn, x, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } - fusedL2ExpKnnRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt, - (uint32_t)numOfNN, mutexes, out_dists, out_inds); + fusedL2ExpKnnRowMajor<<>>(x, + y, + xn, + yn, + m, + n, + k, + lda, + ldb, + ldd, + core_lambda, + fin_op, + sqrt, + (uint32_t)numOfNN, + mutexes, + out_dists, + out_inds); } else { } CUDA_CHECK(cudaGetLastError()); } -template -void fusedL2ExpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *out_dists, - IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, - void *workspace, size_t &worksize) { +template +void fusedL2ExpKnn(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - fusedL2ExpKnnImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, - out_inds, numOfNN, stream, workspace, - worksize); + fusedL2ExpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - fusedL2ExpKnnImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, - out_inds, numOfNN, stream, workspace, - worksize); + fusedL2ExpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else { - fusedL2ExpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2ExpKnnImpl(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } } @@ -732,11 +950,19 @@ void fusedL2ExpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream stream to order kernel launch */ template -void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, size_t n_index_rows, - size_t n_query_rows, int k, bool rowMajorIndex, - bool rowMajorQuery, cudaStream_t stream, - raft::distance::DistanceType metric) { +void fusedL2Knn(size_t D, + value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + size_t n_query_rows, + int k, + bool rowMajorIndex, + bool rowMajorQuery, + cudaStream_t stream, + raft::distance::DistanceType metric) +{ // Validate the input data ASSERT(k > 0, "l2Knn: k must be > 0"); ASSERT(D > 0, "l2Knn: D must be > 0"); @@ -750,8 +976,7 @@ void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists, ASSERT(rowMajorIndex == rowMajorQuery, "l2Knn: rowMajorIndex and rowMajorQuery should have same layout"); // TODO: Add support for column major layout - ASSERT(rowMajorIndex == true, - "l2Knn: only rowMajor inputs are supported for now."); + ASSERT(rowMajorIndex == true, "l2Knn: only rowMajor inputs are supported for now."); // Even for L2 Sqrt distance case we use non-sqrt version as FAISS bfKNN only support // non-sqrt metric & some tests in RAFT/cuML (like Linkage) fails if we use L2 sqrt. @@ -764,37 +989,82 @@ void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists, switch (metric) { case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: - tempWorksize = raft::distance::detail::getWorkspaceSize< - raft::distance::DistanceType::L2Expanded, float, float, float, - value_idx>(query, index, n_query_rows, n_index_rows, D); + tempWorksize = raft::distance::detail:: + getWorkspaceSize( + query, index, n_query_rows, n_index_rows, D); worksize = tempWorksize; workspace.resize(worksize, stream); - fusedL2ExpKnn( - n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, - out_dists, out_inds, k, stream, workspace.data(), worksize); + fusedL2ExpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); if (worksize > tempWorksize) { workspace.resize(worksize, stream); - fusedL2ExpKnn( - n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, - out_dists, out_inds, k, stream, workspace.data(), worksize); + fusedL2ExpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); } break; case raft::distance::DistanceType::L2Unexpanded: case raft::distance::DistanceType::L2SqrtUnexpanded: - fusedL2UnexpKnn( - n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, - out_dists, out_inds, k, stream, workspace.data(), worksize); + fusedL2UnexpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); if (worksize) { workspace.resize(worksize, stream); - fusedL2UnexpKnn(n_query_rows, n_index_rows, D, lda, ldb, ldd, - query, index, sqrt, out_dists, out_inds, k, - stream, workspace.data(), worksize); + fusedL2UnexpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); } break; - default: - printf("only L2 distance metric is supported\n"); - break; + default: printf("only L2 distance metric is supported\n"); break; }; } diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 7d87254cb6..049c11514c 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -35,7 +35,8 @@ namespace knn { namespace detail { template -DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { +DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) +{ value_t sin_0 = sin(0.5 * (x1 - y1)); value_t sin_1 = sin(0.5 * (x2 - y2)); value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1; @@ -56,34 +57,36 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { * @param[in] n_index_rows number of rows in index array * @param[in] k number of closest neighbors to return */ -template -__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, int k) { +template +__global__ void haversine_knn_kernel(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); // Grid is exactly sized to rows available int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize); - const value_t *query_ptr = query + (blockIdx.x * 2); - value_t x1 = query_ptr[0]; - value_t x2 = query_ptr[1]; + const value_t* query_ptr = query + (blockIdx.x * 2); + value_t x1 = query_ptr[0]; + value_t x2 = query_ptr[1]; int i = threadIdx.x; for (; i < limit; i += tpb) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -92,9 +95,9 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, // Handle last remainder fraction of a warp of elements if (i < n_index_rows) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -105,7 +108,7 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, for (int i = threadIdx.x; i < k; i += tpb) { out_dists[blockIdx.x * k + i] = smemK[i]; - out_inds[blockIdx.x * k + i] = smemV[i]; + out_inds[blockIdx.x * k + i] = smemV[i]; } } @@ -126,10 +129,15 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, * @param[in] stream stream to order kernel launch */ template -void haversine_knn(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, size_t n_query_rows, int k, - cudaStream_t stream) { +void haversine_knn(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + size_t n_query_rows, + int k, + cudaStream_t stream) +{ haversine_knn_kernel<<>>( out_inds, out_dists, index, query, n_index_rows, k); } diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index da1217e3cf..2866049188 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -46,13 +46,22 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, - value_t *outK, value_idx *outV, - size_t n_samples, int n_parts, - value_t initK, value_idx initV, int k, - value_idx *translations) { +template +__global__ void knn_merge_parts_kernel(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + value_t initK, + value_idx initV, + int k, + value_idx* translations) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; @@ -61,34 +70,33 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, /** * Uses shared memory */ - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available - int row = blockIdx.x; + int row = blockIdx.x; int total_k = k * n_parts; int i = threadIdx.x; // Get starting pointers for cols in current thread - int part = i / k; + int part = i / k; size_t row_idx = (row * k) + (part * n_samples * k); int col = i % k; - value_t *inKStart = inK + (row_idx + col); - value_idx *inVStart = inV + (row_idx + col); + value_t* inKStart = inK + (row_idx + col); + value_idx* inVStart = inV + (row_idx + col); - int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); value_idx translation = 0; for (; i < limit; i += tpb) { translation = translations[part]; heap.add(*inKStart, (*inVStart) + translation); - part = (i + tpb) / k; + part = (i + tpb) / k; row_idx = (row * k) + (part * n_samples * k); col = (i + tpb) % k; @@ -111,22 +119,27 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, } } -template -inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +template +inline void knn_merge_parts_impl(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ auto grid = dim3(n_samples); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); auto kInit = faiss::gpu::Limits::getMax(); auto vInit = -1; knn_merge_parts_kernel - <<>>(inK, inV, outK, outV, n_samples, n_parts, - kInit, vInit, k, translations); + <<>>( + inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations); CUDA_CHECK(cudaPeekAtLastError()); } @@ -145,10 +158,16 @@ inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, * @param translations mapping of index offsets for each partition */ template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ if (k == 1) knn_merge_parts_impl( inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); @@ -197,26 +216,32 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm */ template -void brute_force_knn_impl(std::vector &input, - std::vector &sizes, IntType D, - float *search_items, IntType n, IdxType *res_I, - float *res_D, IntType k, cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors should be the same size"); - - std::vector *id_ranges; +void brute_force_knn_impl( + std::vector& input, + std::vector& sizes, + IntType D, + float* search_items, + IntType n, + IdxType* res_I, + float* res_D, + IntType k, + cudaStream_t userStream, + cudaStream_t* internalStreams = nullptr, + int n_int_streams = 0, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); + + std::vector* id_ranges; if (translations == nullptr) { // If we don't have explicit translations // for offsets of the indices, build them // from the local partitions - id_ranges = new std::vector(); + id_ranges = new std::vector(); IdxType total_n = 0; for (size_t i = 0; i < input.size(); i++) { id_ranges->push_back(total_n); @@ -232,11 +257,10 @@ void brute_force_knn_impl(std::vector &input, create_processor(metric, n, D, k, rowMajorQuery, userStream); query_metric_processor->preprocess(search_items); - std::vector>> metric_processors( - input.size()); + std::vector>> metric_processors(input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor(metric, sizes[i], D, k, - rowMajorQuery, userStream); + metric_processors[i] = + create_processor(metric, sizes[i], D, k, rowMajorQuery, userStream); metric_processors[i]->preprocess(input[i]); } @@ -244,14 +268,13 @@ void brute_force_knn_impl(std::vector &input, CUDA_CHECK(cudaGetDevice(&device)); rmm::device_uvector trans(id_ranges->size(), userStream); - raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), - userStream); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); rmm::device_uvector all_D(0, userStream); rmm::device_uvector all_I(0, userStream); - float *out_D = res_D; - IdxType *out_I = res_I; + float* out_D = res_D; + IdxType* out_I = res_I; if (input.size() > 1) { all_D.resize(input.size() * k * n, userStream); @@ -265,19 +288,28 @@ void brute_force_knn_impl(std::vector &input, if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); for (size_t i = 0; i < input.size(); i++) { - float *out_d_ptr = out_D + (i * k * n); - IdxType *out_i_ptr = out_I + (i * k * n); + float* out_d_ptr = out_D + (i * k * n); + IdxType* out_i_ptr = out_I + (i * k * n); - cudaStream_t stream = - raft::select_stream(userStream, internalStreams, n_int_streams, i); + cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && (metric == raft::distance::DistanceType::L2Unexpanded || metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::L2Expanded || metric == raft::distance::DistanceType::L2SqrtExpanded)) { - fusedL2Knn(D, out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, - k, rowMajorIndex, rowMajorQuery, stream, metric); + fusedL2Knn(D, + out_i_ptr, + out_d_ptr, + input[i], + search_items, + sizes[i], + n, + k, + rowMajorIndex, + rowMajorQuery, + stream, + metric); } else { switch (metric) { case raft::distance::DistanceType::Haversine: @@ -286,8 +318,7 @@ void brute_force_knn_impl(std::vector &input, "Haversine distance requires 2 dimensions " "(latitude / longitude)."); - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], - n, k, stream); + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); break; default: faiss::MetricType m = build_faiss_metric(metric); @@ -298,18 +329,18 @@ void brute_force_knn_impl(std::vector &input, gpu_res.setDefaultStream(device, stream); faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; + args.numVectors = sizes[i]; + args.queries = search_items; args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; /** * @todo: Until FAISS supports pluggable allocation strategies, @@ -333,8 +364,7 @@ void brute_force_knn_impl(std::vector &input, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. - knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); + knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data()); } // Perform necessary post-processing @@ -342,14 +372,12 @@ void brute_force_knn_impl(std::vector &input, metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / metricArg; + if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg; raft::linalg::unaryOp( - res_D, res_D, n * k, - [p] __device__(float input) { return powf(input, p); }, userStream); + res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream); } query_metric_processor->revert(search_items); diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index b66ea025a2..f87fffc6cf 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -37,11 +37,11 @@ namespace knn { template class MetricProcessor { public: - virtual void preprocess(math_t *data) {} + virtual void preprocess(math_t* data) {} - virtual void revert(math_t *data) {} + virtual void revert(math_t* data) {} - virtual void postprocess(math_t *data) {} + virtual void postprocess(math_t* data) {} virtual ~MetricProcessor() = default; }; @@ -57,37 +57,57 @@ class CosineMetricProcessor : public MetricProcessor { rmm::device_uvector colsums_; public: - CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, - cudaStream_t stream) + CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream) : stream_(stream), colsums_(n_rows, stream), n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), - k_(k) {} + k_(k) + { + } - void preprocess(math_t *data) { - raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_, - raft::linalg::NormType::L2Norm, row_major_, stream_, + void preprocess(math_t* data) + { + raft::linalg::rowNorm(colsums_.data(), + data, + n_cols_, + n_rows_, + raft::linalg::NormType::L2Norm, + row_major_, + stream_, [] __device__(math_t in) { return sqrtf(in); }); raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; }, stream_); } - void revert(math_t *data) { + void revert(math_t* data) + { raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; }, stream_); } - void postprocess(math_t *data) { + void postprocess(math_t* data) + { raft::linalg::unaryOp( - data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, - stream_); + data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_); } ~CosineMetricProcessor() = default; @@ -98,41 +118,59 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { using cosine = CosineMetricProcessor; public: - CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), - means_(n_rows, stream) {} + CorrelationMetricProcessor( + size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream) + { + } - void preprocess(math_t *data) { + void preprocess(math_t* data) + { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; - raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_, - (math_t)0.0, cosine::row_major_, true, + raft::linalg::reduce(means_.data(), + data, + cosine::n_cols_, + cosine::n_rows_, + (math_t)0.0, + cosine::row_major_, + true, cosine::stream_); raft::linalg::unaryOp( - means_.data(), means_.data(), cosine::n_rows_, + means_.data(), + means_.data(), + cosine::n_rows_, [=] __device__(math_t in) { return in * normalizer_const; }, cosine::stream_); - raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanCenter(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); CosineMetricProcessor::preprocess(data); } - void revert(math_t *data) { + void revert(math_t* data) + { CosineMetricProcessor::revert(data); - raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanAdd(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); } - void postprocess(math_t *data) { - CosineMetricProcessor::postprocess(data); - } + void postprocess(math_t* data) { CosineMetricProcessor::postprocess(data); } ~CorrelationMetricProcessor() = default; @@ -142,33 +180,30 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { template class DefaultMetricProcessor : public MetricProcessor { public: - void preprocess(math_t *data) {} + void preprocess(math_t* data) {} - void revert(math_t *data) {} + void revert(math_t* data) {} - void postprocess(math_t *data) {} + void postprocess(math_t* data) {} ~DefaultMetricProcessor() = default; }; template inline std::unique_ptr> create_processor( - distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream) { - MetricProcessor *mp = nullptr; + distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, cudaStream_t userStream) +{ + MetricProcessor* mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = - new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); + mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); break; case distance::DistanceType::CorrelationExpanded: - mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream); + mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream); break; - default: - mp = new DefaultMetricProcessor(); + default: mp = new DefaultMetricProcessor(); } return std::unique_ptr>(mp); diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh index 045edad0e6..88fa58a4d7 100644 --- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh @@ -31,27 +31,33 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, - size_t n_cols, K *outK, IndexType *outV, - K initK, IndexType initV, int k) { +template +__global__ void select_k_kernel(K* inK, + IndexType* inV, + size_t n_rows, + size_t n_cols, + K* outK, + IndexType* outV, + K initK, + IndexType initV, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ K smemK[kNumWarps * warp_q]; __shared__ IndexType smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, - warp_q, thread_q, tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available int row = blockIdx.x; - int i = threadIdx.x; + int i = threadIdx.x; - int idx = row * n_cols; - K *inKStart = inK + idx + i; - IndexType *inVStart = inV + idx + i; + int idx = row * n_cols; + K* inKStart = inK + idx + i; + IndexType* inVStart = inV + idx + i; // Whole warps must participate in the selection int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize); @@ -78,27 +84,31 @@ __global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, } } -template -inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, - size_t n_cols, value_t *outK, value_idx *outV, - bool select_min, int k, cudaStream_t stream) { +template +inline void select_k_impl(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ auto grid = dim3(n_rows); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); - auto kInit = select_min ? faiss::gpu::Limits::getMax() - : faiss::gpu::Limits::getMin(); + auto kInit = + select_min ? faiss::gpu::Limits::getMax() : faiss::gpu::Limits::getMin(); auto vInit = -1; if (select_min) { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } else { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } CUDA_CHECK(cudaGetLastError()); } @@ -118,30 +128,37 @@ inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, * @param[in] stream CUDA stream to use */ template -inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, - value_t *outK, value_idx *outV, bool select_min, int k, - cudaStream_t stream) { +inline void select_k(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ if (k == 1) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 32) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 64) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 128) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 256) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 512) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 1024) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); } }; // namespace detail diff --git a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh index 84719a0e4b..abc4cdf545 100644 --- a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh @@ -30,21 +30,25 @@ struct KeyValuePair { __host__ __device__ __forceinline__ KeyValuePair() {} /// Copy Constructors - __host__ __device__ __forceinline__ - KeyValuePair(cub::KeyValuePair<_Key, _Value>& kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value>& kvp) + : key(kvp.key), value(kvp.value) + { + } - __host__ __device__ __forceinline__ - KeyValuePair(faiss::gpu::KeyValuePair<_Key, _Value>& kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(faiss::gpu::KeyValuePair<_Key, _Value>& kvp) + : key(kvp.key), value(kvp.value) + { + } /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const& key, - Value const& value) - : key(key), value(value) {} + __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) + : key(key), value(value) + { + } /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) { + __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) + { return (value != b.value) || (key != b.key); } }; @@ -117,9 +121,9 @@ struct KeyValuePair { // // If IsBitonic is false, the first stage is reversed, so we don't // need to sort directionally. It's still technically a bitonic sort. -template -inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { +template +inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) +{ static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); static_assert(L <= kWarpSize / 2, "merge list size must be <= 16"); @@ -129,7 +133,7 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { // Reverse the first comparison stage. // For example, merging a list of size 8 has the exchanges: // 0 <-> 15, 1 <-> 14, ... - K otherK = shfl_xor(k, 2 * L - 1); + K otherK = shfl_xor(k, 2 * L - 1); K otherVk = shfl_xor(v.key, 2 * L - 1); V otherVv = shfl_xor(v.value, 2 * L - 1); @@ -157,7 +161,7 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { #pragma unroll for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) { - K otherK = shfl_xor(k, stride); + K otherK = shfl_xor(k, stride); K otherVk = shfl_xor(v.key, stride); V otherVv = shfl_xor(v.value, stride); @@ -183,9 +187,9 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { // Template for performing a bitonic merge of an arbitrary set of // registers -template -struct BitonicMergeStepKVP {}; +template +struct BitonicMergeStepKVP { +}; // // Power-of-2 merge specialization @@ -194,7 +198,8 @@ struct BitonicMergeStepKVP {}; // All merges eventually call this template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[1], KeyValuePair v[1]) { + static inline __device__ void merge(K k[1], KeyValuePair v[1]) + { // Use warp shuffles warpBitonicMergeLE16KVP(k[0], v[0]); } @@ -202,16 +207,17 @@ struct BitonicMergeStepKVP { template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) { + static inline __device__ void merge(K k[N], KeyValuePair v[N]) + { static_assert(utils::isPowerOf2(N), "must be power of 2"); static_assert(N > 1, "must be N > 1"); #pragma unroll for (int i = 0; i < N / 2; ++i) { - K& ka = k[i]; + K& ka = k[i]; KeyValuePair& va = v[i]; - K& kb = k[i + N / 2]; + K& kb = k[i + N / 2]; KeyValuePair& vb = v[i + N / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); @@ -226,18 +232,17 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; + newK[i] = k[i]; + newV[i].key = v[i].key; newV[i].value = v[i].value; } - BitonicMergeStepKVP::merge(newK, - newV); + BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < N / 2; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; + k[i] = newK[i]; + v[i].key = newV[i].key; v[i].value = newV[i].value; } } @@ -248,18 +253,17 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i + N / 2]; - newV[i].key = v[i + N / 2].key; + newK[i] = k[i + N / 2]; + newV[i].key = v[i + N / 2].key; newV[i].value = v[i + N / 2].value; } - BitonicMergeStepKVP::merge(newK, - newV); + BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < N / 2; ++i) { - k[i + N / 2] = newK[i]; - v[i + N / 2].key = newV[i].key; + k[i + N / 2] = newK[i]; + v[i + N / 2].key = newV[i].key; v[i + N / 2].value = newV[i].value; } } @@ -273,7 +277,8 @@ struct BitonicMergeStepKVP { // Low recursion template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) { + static inline __device__ void merge(K k[N], KeyValuePair v[N]) + { static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -281,10 +286,10 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; + K& ka = k[i]; KeyValuePair& va = v[i]; - K& kb = k[i + kNextHighestPowerOf2 / 2]; + K& kb = k[i + kNextHighestPowerOf2 / 2]; KeyValuePair& vb = v[i + kNextHighestPowerOf2 / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); @@ -293,7 +298,7 @@ struct BitonicMergeStepKVP { swap(s, va.value, vb.value); } - constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; + constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; constexpr int kHighSize = kNextHighestPowerOf2 / 2; { K newK[kLowSize]; @@ -301,23 +306,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; + newK[i] = k[i]; + newV[i].key = v[i].key; newV[i].value = v[i].value; } - constexpr bool kLowIsPowerOf2 = - utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; + k[i] = newK[i]; + v[i].key = newV[i].key; v[i].value = newV[i].value; } } @@ -328,23 +336,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i].key = v[i + kLowSize].key; + newK[i] = k[i + kLowSize]; + newV[i].key = v[i + kLowSize].key; newV[i].value = v[i + kLowSize].value; } - constexpr bool kHighIsPowerOf2 = - utils::isPowerOf2(kNextHighestPowerOf2 / 2); + constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize].key = newV[i].key; + k[i + kLowSize] = newK[i]; + v[i + kLowSize].key = newV[i].key; v[i + kLowSize].value = newV[i].value; } } @@ -354,7 +365,8 @@ struct BitonicMergeStepKVP { // High recursion template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) { + static inline __device__ void merge(K k[N], KeyValuePair v[N]) + { static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -362,10 +374,10 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; + K& ka = k[i]; KeyValuePair& va = v[i]; - K& kb = k[i + kNextHighestPowerOf2 / 2]; + K& kb = k[i + kNextHighestPowerOf2 / 2]; KeyValuePair& vb = v[i + kNextHighestPowerOf2 / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); @@ -374,7 +386,7 @@ struct BitonicMergeStepKVP { swap(s, va.value, vb.value); } - constexpr int kLowSize = kNextHighestPowerOf2 / 2; + constexpr int kLowSize = kNextHighestPowerOf2 / 2; constexpr int kHighSize = N - kNextHighestPowerOf2 / 2; { K newK[kLowSize]; @@ -382,23 +394,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; + newK[i] = k[i]; + newV[i].key = v[i].key; newV[i].value = v[i].value; } - constexpr bool kLowIsPowerOf2 = - utils::isPowerOf2(kNextHighestPowerOf2 / 2); + constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; + k[i] = newK[i]; + v[i].key = newV[i].key; v[i].value = newV[i].value; } } @@ -409,23 +424,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i].key = v[i + kLowSize].key; + newK[i] = k[i + kLowSize]; + newV[i].key = v[i + kLowSize].key; newV[i].value = v[i + kLowSize].value; } - constexpr bool kHighIsPowerOf2 = - utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize].key = newV[i].key; + k[i + kLowSize] = newK[i]; + v[i + kLowSize].key = newV[i].key; v[i + kLowSize].value = newV[i].value; } } @@ -436,20 +454,20 @@ struct BitonicMergeStepKVP { /// i.e., merges a sorted k/v list of size kWarpSize * N1 with a /// sorted k/v list of size kWarpSize * N2, where N1 and N2 are any /// value >= 1 -template +template inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], KeyValuePair v1[N1], K k2[N2], - KeyValuePair v2[N2]) { + KeyValuePair v2[N2]) +{ constexpr int kSmallestN = N1 < N2 ? N1 : N2; #pragma unroll for (int i = 0; i < kSmallestN; ++i) { - K& ka = k1[N1 - 1 - i]; + K& ka = k1[N1 - 1 - i]; KeyValuePair& va = v1[N1 - 1 - i]; - K& kb = k2[i]; + K& kb = k2[i]; KeyValuePair& vb = v2[i]; K otherKa; @@ -457,13 +475,13 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], if (FullMerge) { // We need the other values - otherKa = shfl_xor(ka, kWarpSize - 1); + otherKa = shfl_xor(ka, kWarpSize - 1); K otherVak = shfl_xor(va.key, kWarpSize - 1); V otherVav = shfl_xor(va.value, kWarpSize - 1); - otherVa = KeyValuePair(otherVak, otherVav); + otherVa = KeyValuePair(otherVak, otherVav); } - K otherKb = shfl_xor(kb, kWarpSize - 1); + K otherKb = shfl_xor(kb, kWarpSize - 1); K otherVbk = shfl_xor(vb.key, kWarpSize - 1); V otherVbv = shfl_xor(vb.value, kWarpSize - 1); @@ -487,12 +505,10 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], } } - BitonicMergeStepKVP::merge( - k1, v1); + BitonicMergeStepKVP::merge(k1, v1); if (FullMerge) { // Only if we care about N2 do we need to bother merging it fully - BitonicMergeStepKVP::merge(k2, v2); + BitonicMergeStepKVP::merge(k2, v2); } } @@ -500,7 +516,8 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], // bitonic sort template struct BitonicSortStepKVP { - static inline __device__ void sort(K k[N], KeyValuePair v[N]) { + static inline __device__ void sort(K k[N], KeyValuePair v[N]) + { static_assert(N > 1, "did not hit specialized case"); // Sort recursively @@ -512,8 +529,8 @@ struct BitonicSortStepKVP { #pragma unroll for (int i = 0; i < kSizeA; ++i) { - aK[i] = k[i]; - aV[i].key = v[i].key; + aK[i] = k[i]; + aV[i].key = v[i].key; aV[i].value = v[i].value; } @@ -524,8 +541,8 @@ struct BitonicSortStepKVP { #pragma unroll for (int i = 0; i < kSizeB; ++i) { - bK[i] = k[i + kSizeA]; - bV[i].key = v[i + kSizeA].key; + bK[i] = k[i + kSizeA]; + bV[i].key = v[i + kSizeA].key; bV[i].value = v[i + kSizeA].value; } @@ -536,15 +553,15 @@ struct BitonicSortStepKVP { #pragma unroll for (int i = 0; i < kSizeA; ++i) { - k[i] = aK[i]; - v[i].key = aV[i].key; + k[i] = aK[i]; + v[i].key = aV[i].key; v[i].value = aV[i].value; } #pragma unroll for (int i = 0; i < kSizeB; ++i) { - k[i + kSizeA] = bK[i]; - v[i + kSizeA].key = bV[i].key; + k[i + kSizeA] = bK[i]; + v[i + kSizeA].key = bV[i].key; v[i + kSizeA].value = bV[i].value; } } @@ -553,7 +570,8 @@ struct BitonicSortStepKVP { // Single warp (N == 1) sorting specialization template struct BitonicSortStepKVP { - static inline __device__ void sort(K k[1], KeyValuePair v[1]) { + static inline __device__ void sort(K k[1], KeyValuePair v[1]) + { // Update this code if this changes // should go from 1 -> kWarpSize in multiples of 2 static_assert(kWarpSize == 32, "unexpected warp size"); @@ -569,61 +587,64 @@ struct BitonicSortStepKVP { /// Sort a list of kWarpSize * N elements in registers, where N is an /// arbitrary >= 1 template -inline __device__ void warpSortAnyRegistersKVP(K k[N], - KeyValuePair v[N]) { +inline __device__ void warpSortAnyRegistersKVP(K k[N], KeyValuePair v[N]) +{ BitonicSortStepKVP::sort(k, v); } // `Dir` true, produce largest values. // `Dir` false, produce smallest values. -template +template struct KeyValueWarpSelect { static constexpr int kNumWarpQRegisters = NumWarpQ / faiss::gpu::kWarpSize; - __device__ inline KeyValueWarpSelect(K initKVal, - faiss::gpu::KeyValuePair initVVal, - int k) + __device__ inline KeyValueWarpSelect(K initKVal, faiss::gpu::KeyValuePair initVVal, int k) : initK(initKVal), initV(initVVal), numVals(0), warpKTop(initKVal), warpKTopRDist(initKVal), - kLane((k - 1) % faiss::gpu::kWarpSize) { - static_assert(faiss::gpu::utils::isPowerOf2(ThreadsPerBlock), - "threads must be a power-of-2"); - static_assert(faiss::gpu::utils::isPowerOf2(NumWarpQ), - "warp queue must be power-of-2"); + kLane((k - 1) % faiss::gpu::kWarpSize) + { + static_assert(faiss::gpu::utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); + static_assert(faiss::gpu::utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); // Fill the per-thread queue keys with the default value #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initV.key; + threadK[i] = initK; + threadV[i].key = initV.key; threadV[i].value = initV.value; } // Fill the warp queue with the default value #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i] = initK; - warpV[i].key = initV.key; + warpK[i] = initK; + warpV[i].key = initV.key; warpV[i].value = initV.value; } } - __device__ inline void addThreadQ(K k, faiss::gpu::KeyValuePair& v) { + __device__ inline void addThreadQ(K k, faiss::gpu::KeyValuePair& v) + { if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { // Rotate right #pragma unroll for (int i = NumThreadQ - 1; i > 0; --i) { - threadK[i] = threadK[i - 1]; - threadV[i].key = threadV[i - 1].key; + threadK[i] = threadK[i - 1]; + threadV[i].key = threadV[i - 1].key; threadV[i].value = threadV[i - 1].value; } - threadK[0] = k; - threadV[0].key = v.key; + threadK[0] = k; + threadV[0].key = v.key; threadV[0].value = v.value; ++numVals; } @@ -633,33 +654,35 @@ struct KeyValueWarpSelect { /// list across both // TODO - __device__ inline void mergeWarpQ() { + __device__ inline void mergeWarpQ() + { // Sort all of the per-thread queues - faiss::gpu::warpSortAnyRegistersKVP(threadK, - threadV); + faiss::gpu::warpSortAnyRegistersKVP(threadK, threadV); // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing // one sorted list - faiss::gpu::warpMergeAnyRegistersKVP(warpK, warpV, - threadK, threadV); + faiss::gpu::warpMergeAnyRegistersKVP( + warpK, warpV, threadK, threadV); } /// WARNING: all threads in a warp must participate in this. /// Otherwise, you must call the constituent parts separately. - __device__ inline void add(K k, faiss::gpu::KeyValuePair& v) { + __device__ inline void add(K k, faiss::gpu::KeyValuePair& v) + { addThreadQ(k, v); checkThreadQ(); } - __device__ inline void reduce() { + __device__ inline void reduce() + { // Have all warps dump and merge their queues; this will produce // the final per-warp results mergeWarpQ(); } - __device__ inline void checkThreadQ() { + __device__ inline void checkThreadQ() + { bool needSort = (numVals == NumThreadQ); #if CUDA_VERSION >= 9000 @@ -681,18 +704,19 @@ struct KeyValueWarpSelect { #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initV.key; + threadK[i] = initK; + threadV[i].key = initV.key; threadV[i].value = initV.value; } // We have to beat at least this element warpKTopRDist = shfl(warpV[kNumWarpQRegisters - 1].key, kLane); - warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); + warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); } /// Dump final k selected values for this warp out - __device__ inline void writeOut(K* outK, V* outV, int k) { + __device__ inline void writeOut(K* outK, V* outV, int k) + { int laneId = faiss::gpu::getLaneId(); #pragma unroll diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a2e9151dbc..eb9a8f1436 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -52,12 +52,17 @@ using deviceAllocator = raft::mr::device::allocator; * @param translations */ template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { - detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, - translations); +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ + detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); } /** @@ -82,9 +87,16 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param stream */ template -inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, - value_t *outK, value_idx *outV, bool select_min, int k, - cudaStream_t stream) { +inline void select_k(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ detail::select_k(inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); } @@ -111,22 +123,41 @@ inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, * @param[in] translations starting offsets for partitions. should be the same size * as input vector. */ -inline void brute_force_knn( - raft::handle_t const &handle, std::vector &input, - std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, - float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true, - std::vector *translations = nullptr, - distance::DistanceType metric = distance::DistanceType::L2Expanded, - float metric_arg = 2.0f) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors must be the same size"); +inline void brute_force_knn(raft::handle_t const& handle, + std::vector& input, + std::vector& sizes, + int D, + float* search_items, + int n, + int64_t* res_I, + float* res_D, + int k, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + distance::DistanceType metric = distance::DistanceType::L2Expanded, + float metric_arg = 2.0f) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); std::vector int_streams = handle.get_internal_streams(); - detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_stream(), int_streams.data(), - handle.get_num_internal_streams(), rowMajorIndex, - rowMajorQuery, translations, metric, metric_arg); + detail::brute_force_knn_impl(input, + sizes, + D, + search_items, + n, + res_I, + res_D, + k, + handle.get_stream(), + int_streams.data(), + handle.get_num_internal_streams(), + rowMajorIndex, + rowMajorQuery, + translations, + metric, + metric_arg); } } // namespace knn } // namespace spatial diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 6f507331d9..221a9679d4 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -24,8 +24,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct cluster_solver_config_t { size_type_t n_clusters; size_type_t maxIter; @@ -35,23 +34,35 @@ struct cluster_solver_config_t { unsigned long long seed{123456}; }; -template +template struct kmeans_solver_t { - explicit kmeans_solver_t(cluster_solver_config_t const& config) - : config_(config) {} - - std::pair solve( - handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const { + explicit kmeans_solver_t( + cluster_solver_config_t const& config) + : config_(config) + { + } + + std::pair solve(handle_t const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol, - config_.maxIter, obs, codes, residual, iters, config_.seed); + kmeans(handle, + n_obs_vecs, + dim, + config_.n_clusters, + config_.tol, + config_.maxIter, + obs, + codes, + residual, + iters, + config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index e36dca2e0c..156b996586 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -23,8 +23,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct eigen_solver_config_t { size_type_t n_eigVecs; size_type_t maxIter; @@ -34,42 +33,59 @@ struct eigen_solver_config_t { bool reorthogonalize{false}; unsigned long long seed{ - 1234567}; // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations }; -template +template struct lanczos_solver_t { - explicit lanczos_solver_t(eigen_solver_config_t const& config) - : config_(config) {} + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } - index_type_t solve_smallest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_smallest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, - eigVecs, config_.seed); + computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); return iters; } - index_type_t solve_largest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_largest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, eigVecs, + computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, config_.seed); return iters; } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index d089b85518..18b23bea55 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -43,15 +43,15 @@ using namespace raft::linalg; // Useful grid settings // ========================================================= -constexpr unsigned int BLOCK_SIZE = 1024; -constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels // ========================================================= -/** +/** * @brief Compute distances between observation vectors and centroids * Block dimensions should be (warpSize, 1, * blockSize/warpSize). Ideally, the grid is large enough so there @@ -75,11 +75,13 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances( - index_type_t n, index_type_t d, index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) { +static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) +{ // Loop index index_type_t i; @@ -114,12 +116,10 @@ static __global__ void computeDistances( // Perform reduction on warp for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += - __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); // Write result to global memory - if (threadIdx.x == 0) - atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -134,8 +134,8 @@ static __global__ void computeDistances( } } -/** - * @brief Find closest centroid to observation vectors. +/** + * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -156,10 +156,12 @@ static __global__ void computeDistances( * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, index_type_t k, +static __global__ void minDistances(index_type_t n, + index_type_t k, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { + index_type_t* __restrict__ clusterSizes) +{ // Loop index index_type_t i, j; @@ -178,8 +180,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, dist_min = dists[IDX(i, 0, n)]; for (j = 1; j < k; ++j) { dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } // Transfer result to global memory @@ -194,8 +196,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, } } -/** - * @brief Check if newly computed distances are smaller than old distances. +/** + * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -218,7 +220,8 @@ static __global__ void minDistances2(index_type_t n, value_type_t* __restrict__ dists_old, const value_type_t* __restrict__ dists_new, index_type_t* __restrict__ codes_old, - index_type_t code_new) { + index_type_t code_new) +{ // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -243,7 +246,7 @@ static __global__ void minDistances2(index_type_t n, } } -/** +/** * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. @@ -255,9 +258,10 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes( - index_type_t n, const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { +static __global__ void computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); @@ -265,8 +269,8 @@ static __global__ void computeClusterSizes( } } -/** - * @brief Divide rows of centroid matrix by cluster sizes. +/** + * @brief Divide rows of centroid matrix by cluster sizes. * Divides the ith column of the sum matrix by the size of the ith * cluster. If the sum matrix has been initialized so that the ith * row is the sum of all observation vectors in the ith cluster, @@ -287,9 +291,11 @@ static __global__ void computeClusterSizes( * column is the mean position of a cluster). */ template -static __global__ void divideCentroids( - index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) { +static __global__ void divideCentroids(index_type_t d, + index_type_t k, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) +{ // Global indices index_type_t gidx, gidy; @@ -340,11 +346,14 @@ static __global__ void divideCentroids( * @return Zero if successful. Otherwise non-zero. */ template -static int chooseNewCentroid(handle_t const& handle, index_type_t n, - index_type_t d, value_type_t rand, +static int chooseNewCentroid(handle_t const& handle, + index_type_t n, + index_type_t d, + value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) { + value_type_t* __restrict__ centroid) +{ // Cumulative sum of distances value_type_t* distsCumSum = dists + n; // Residual sum of squares @@ -352,44 +361,44 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, // Observation vector that is chosen as new centroid index_type_t obsIndex; - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::inclusive_scan(thrust_exec_policy, + thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync( + &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) // - //seg-faults due to Thrust bug - //on binary-search-like algorithms - //when run with stream dependent - //execution policies; fixed on Thrust GitHub - //hence replace w/ linear interpolation, - //until the Thrust issue gets resolved: + // seg-faults due to Thrust bug + // on binary-search-like algorithms + // when run with stream dependent + // execution policies; fixed on Thrust GitHub + // hence replace w/ linear interpolation, + // until the Thrust issue gets resolved: // // obsIndex = (thrust::lower_bound( // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - // thrust::device_pointer_cast(distsCumSum)); // - //linear interpolation logic: + // linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / - (distsSum - minSum)); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); } else { obsIndex = 0; } @@ -400,15 +409,17 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, obsIndex = min(obsIndex, n - 1); // Record new centroid position - CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, + CUDA_TRY(cudaMemcpyAsync(centroid, + obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); return 0; } /** - * @brief Choose initial cluster centroids for k-means algorithm. + * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -432,11 +443,17 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, * @return Zero if successful. Otherwise non-zero. */ template -static int initializeCentroids( - handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, - const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ dists, unsigned long long seed) { +static int initializeCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -448,7 +465,7 @@ static int initializeCentroids( thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -461,35 +478,34 @@ static int initializeCentroids( dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; // CUDA grid dimensions - dim3 gridDim_warp{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), - 1, 1}; + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid - thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), 1); + thrust::fill(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + 1); CHECK_CUDA(stream); if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids, dists); + computeDistances<<>>(n, d, 1, obs, centroids, dists); CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, - centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -499,22 +515,20 @@ static int initializeCentroids( CHECK_CUDA(stream); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, - codes, i); + minDistances2<<>>(n, dists, dists + n, codes, i); CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, codes, - clusterSizes); + computeClusterSizes<<>>(n, codes, clusterSizes); CHECK_CUDA(stream); return 0; } -/** - * @brief Find cluster centroids closest to observation vectors. +/** + * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -540,15 +554,18 @@ static int initializeCentroids( * @return Zero if successful. Otherwise non-zero. */ template -static int assignCentroids(handle_t const& handle, index_type_t n, - index_type_t d, index_type_t k, +static int assignCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, const value_type_t* __restrict__ obs, const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { - auto stream = handle.get_stream(); + value_type_t* residual_host) +{ + auto stream = handle.get_stream(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors @@ -561,11 +578,9 @@ static int assignCentroids(handle_t const& handle, index_type_t n, constexpr index_type_t grid_lower_bound{65535}; gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); gridDim.y = min(k, grid_lower_bound); - gridDim.z = - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - computeDistances<<>>(n, d, k, obs, centroids, - dists); + computeDistances<<>>(n, d, k, obs, centroids, dists); CHECK_CUDA(stream); // Find centroid closest to each observation vector @@ -573,23 +588,21 @@ static int assignCentroids(handle_t const& handle, index_type_t n, blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, - clusterSizes); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); CHECK_CUDA(stream); // Compute residual sum of squares - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); return 0; } -/** - * @brief Update cluster centroids for k-means algorithm. +/** + * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -613,26 +626,29 @@ static int assignCentroids(handle_t const& handle, index_type_t n, * @return Zero if successful. Otherwise non-zero. */ template -static int updateCentroids(handle_t const& handle, index_type_t n, - index_type_t d, index_type_t k, +static int updateCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, const index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) { + index_type_t* __restrict__ work_int) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const value_type_t one = 1; + const value_type_t one = 1; const value_type_t zero = 0; constexpr index_type_t grid_lower_bound{65535}; - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Device memory @@ -641,34 +657,56 @@ static int updateCentroids(handle_t const& handle, index_type_t n, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, - d, &zero, (value_type_t*)NULL, n, - thrust::raw_pointer_cast(obs_copy), n, stream)); + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::modulus()); CHECK_CUDA(stream); - thrust::gather(thrust_exec_policy, rows, rows + d * n, - thrust::device_pointer_cast(codes), codes_copy); + thrust::gather( + thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::divides()); CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + thrust::stable_sort_by_key(thrust_exec_policy, + codes_copy, + codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + thrust::reduce_by_key(thrust_exec_policy, + rows, + rows + d * n, + obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); CHECK_CUDA(stream); @@ -679,12 +717,11 @@ static int updateCentroids(handle_t const& handle, index_type_t n, dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; // CUDA grid dimensions - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), + 1}; - divideCentroids<<>>(d, k, clusterSizes, - centroids); + divideCentroids<<>>(d, k, clusterSizes, centroids); CHECK_CUDA(stream); return 0; @@ -698,8 +735,8 @@ namespace raft { // k-means algorithm // ========================================================= -/** - * @brief Find clusters with k-means algorithm. +/** + * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. @@ -735,15 +772,22 @@ namespace raft { * @return error flag. */ template -int kmeans(handle_t const& handle, index_type_t n, index_type_t d, - index_type_t k, value_type_t tol, index_type_t maxiter, +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, - value_type_t* residual_host, index_type_t* iters_host, - unsigned long long seed) { + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int, + value_type_t* residual_host, + index_type_t* iters_host, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -764,101 +808,93 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, // Initialization // ------------------------------------------------------- - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), - cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, - work, work_int)) + CUDA_TRY( + cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), - grid_lower_bound)}; + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, - work); + computeDistances<<>>(n, d, 1, obs, centroids, work); CHECK_CUDA(stream); - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), - thrust::device_pointer_cast(work + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); CHECK_CUDA(stream); return 0; } if (n <= k) { - thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::sequence(thrust_exec_policy, + thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), n, 1); + thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); CHECK_CUDA(stream); if (n < k) - CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, - (k - n) * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; } // Initialize cuBLAS - CUBLAS_CHECK( - linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, - work, seed)) + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, - work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, - clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); // FIXME: emptyCentroid never reaches k (infinite loop) under certain // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, work, - centroids + IDX(0, emptyCentroid, d))) + if (chooseNewCentroid( + handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, - clusterSizes, residual_host)) + if (assignCentroids( + handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); CHECK_CUDA(stream); } @@ -870,14 +906,13 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, } // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) - WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); *iters_host = iter; return 0; } -/** +/** * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with @@ -903,11 +938,18 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, * @return error flag */ template -int kmeans(handle_t const& handle, index_type_t n, index_type_t d, - index_type_t k, value_type_t tol, index_type_t maxiter, +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, value_type_t& residual, - index_type_t& iters, unsigned long long seed = 123456) { + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ using namespace matrix; // Check that parameters are valid @@ -924,9 +966,21 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans( - handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), - centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); + return kmeans(handle, + n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters, + seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index d14bf05f37..35fc22c770 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -21,66 +21,125 @@ #include #include -//for now; TODO: check if/where this `define` should be; +// for now; TODO: check if/where this `define` should be; // #define USE_LAPACK namespace raft { -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status \ - << " had an illegal value."; \ - throw exception(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status << " had an illegal value."; \ + throw exception(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ } -extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, - float *work, int *lwork, int *info); -extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, - double *work, int *lwork, int *info); -extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, - float *a, int *lda, const float *tau, float *c, - int *ldc, float *work, int *lwork, int *info); -extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, - double *a, int *lda, const double *tau, double *c, - int *ldc, double *work, int *lwork, int *info); -extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, - double *wr, double *wi, double *vl, int *ldvl, double *vr, - int *ldvr, double *work, int *lwork, int *info); - -extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, - float *wr, float *wi, float *vl, int *ldvl, float *vr, - int *ldvr, float *work, int *lwork, int *info); - -extern "C" cusolverStatus_t cusolverDnSgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const float *alpha, const float *A, int lda, const float *B, int ldb, - const float *beta, float *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnDgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const double *alpha, const double *A, int lda, const double *B, int ldb, - const double *beta, double *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, - int n, float *d, float *e, - float *z, int ldz, float *work, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, - int n, double *d, double *e, - double *z, int ldz, - double *work, int *info); +extern "C" void sgeqrf_( + int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info); +extern "C" void dgeqrf_( + int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info); +extern "C" void sormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + float* a, + int* lda, + const float* tau, + float* c, + int* ldc, + float* work, + int* lwork, + int* info); +extern "C" void dormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + double* a, + int* lda, + const double* tau, + double* c, + int* ldc, + double* work, + int* lwork, + int* info); +extern "C" int dgeev_(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info); + +extern "C" int sgeev_(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost( + const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info); + +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info); template class Lapack { @@ -91,182 +150,339 @@ class Lapack { public: static void check_lapack_enabled(); - static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, + static void gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, int ldc); // special QR for lanczos - static void sterf(int n, T *d, T *e); - static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + static void sterf(int n, T* d, T* e); + static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work); // QR // computes the QR factorization of a general matrix - static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork); // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. // multiply C by implicit Q - static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork); - - static void geev(T *A, T *eigenvalues, int dim, int lda); - static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + static void ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork); + + static void geev(T* A, T* eigenvalues, int dim, int lda); + static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); + static void geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, int ldvr); - static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, - T *eigenvectors_i, int dim, int lda, int ldvr); private: - static void lapack_gemm(const char transa, const char transb, int m, int n, - int k, float alpha, const float *a, int lda, - const float *b, int ldb, float beta, float *c, - int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (float *)a, lda, (float *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const char transa, + const char transb, + int m, + int n, + int k, + float alpha, + const float* a, + int lda, + const float* b, + int ldb, + float beta, + float* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost( + cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc); } - static void lapack_gemm(const signed char transa, const signed char transb, - int m, int n, int k, double alpha, const double *a, - int lda, const double *b, int ldb, double beta, - double *c, int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (double *)a, lda, (double *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const signed char transa, + const signed char transb, + int m, + int n, + int k, + double alpha, + const double* a, + int lda, + const double* b, + int ldb, + double beta, + double* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, + cublas_transb, + m, + n, + k, + &alpha, + (double*)a, + lda, + (double*)b, + ldb, + &beta, + c, + ldc); } - static void lapack_sterf(int n, float *d, float *e, int *info) { + static void lapack_sterf(int n, float* d, float* e, int* info) + { cusolverDnSsterfHost(n, d, e, info); } - static void lapack_sterf(int n, double *d, double *e, int *info) { + static void lapack_sterf(int n, double* d, double* e, int* info) + { cusolverDnDsterfHost(n, d, e, info); } - static void lapack_steqr(const signed char compz, int n, float *d, float *e, - float *z, int ldz, float *work, int *info) { + static void lapack_steqr( + const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info) + { cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_steqr(const signed char compz, int n, double *d, double *e, - double *z, int ldz, double *work, int *info) { + static void lapack_steqr(const signed char compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info) + { cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, - float *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info) + { sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, - double *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info) + { dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, - int lda, float *tau, float *c, int ldc, float *work, - int *lwork, int *info) { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + float* a, + int lda, + float* tau, + float* c, + int ldc, + float* work, + int* lwork, + int* info) + { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, - double *a, int lda, double *tau, double *c, int ldc, - double *work, int *lwork, int *info) { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + double* a, + int lda, + double* tau, + double* c, + int ldc, + double* work, + int* lwork, + int* info) + { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, - int *lda, double *wr, double *wi, double *vl, - int *ldvl, double *vr, int *ldvr, - double *work, int *lwork, int *info) { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info) + { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, - int *lda, float *wr, float *wi, float *vl, - int *ldvl, float *vr, int *ldvr, float *work, - int *lwork, int *info) { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info) + { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } // real eigenvalues - static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) { + static void lapack_geev(T* A, T* eigenvalues, int dim, int lda) + { char job = 'N'; std::vector WI(dim); - int ldv = 1; - T *vl = 0; + int ldv = 1; + T* vl = 0; int work_size = 6 * dim; std::vector work(work_size); int info; - lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, - &ldv, vl, &ldv, work.data(), &work_size, &info); + lapack_geev_dispatch(&job, + &job, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldv, + vl, + &ldv, + work.data(), + &work_size, + &info); lapackCheckError(info); } // real eigenpairs - static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, - int lda, int ldvr) { + static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) + { char jobvl = 'N'; char jobvr = 'V'; std::vector WI(dim); int work_size = 6 * dim; - T *vl = 0; - int ldvl = 1; + T* vl = 0; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), - vl, &ldvl, eigenvectors, &ldvr, work.data(), - &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldvl, + eigenvectors, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } // complex eigenpairs - static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, - int lda, int ldvr) { - char jobvl = 'N'; - char jobvr = 'V'; + static void lapack_geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; int work_size = 8 * dim; - int ldvl = 1; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, - eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, - work.data(), &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues_r, + eigenvalues_i, + 0, + &ldvl, + eigenvectors_r, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } }; template -void Lapack::check_lapack_enabled() { +void Lapack::check_lapack_enabled() +{ #ifndef USE_LAPACK RAFT_FAIL("Error: LAPACK not enabled."); #endif } template -void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, - int ldc) { +void Lapack::gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, + int ldc) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK const char transA_char = transa ? 'T' : 'N'; const char transB_char = transb ? 'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, - ldc); + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); //#endif } template -void Lapack::sterf(int n, T *d, T *e) { +void Lapack::sterf(int n, T* d, T* e) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -276,7 +492,8 @@ void Lapack::sterf(int n, T *d, T *e) { } template -void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { +void Lapack::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -286,8 +503,8 @@ void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { } template -void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, - int *lwork) { +void Lapack::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK int info; @@ -296,11 +513,22 @@ void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, #endif } template -void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork) { +void Lapack::ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; + char side = right_side ? 'R' : 'L'; char trans = transq ? 'T' : 'N'; int info; lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); @@ -310,7 +538,8 @@ void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, // real eigenvalues template -void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { +void Lapack::geev(T* A, T* eigenvalues, int dim, int lda) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, dim, lda); @@ -318,8 +547,8 @@ void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { } // real eigenpairs template -void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); @@ -327,13 +556,18 @@ void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, } // complex eigenpairs template -void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, - dim, lda, ldvr); + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); #endif } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 42fc621a1a..9d1f899d66 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -41,10 +41,12 @@ using size_type = int; // for now; TODO: move it in appropriate header // Apply diagonal matrix to vector: // template -static __global__ void diagmv(IndexType_ n, ValueType_ alpha, +static __global__ void diagmv(IndexType_ n, + ValueType_ alpha, const ValueType_* __restrict__ D, const ValueType_* __restrict__ x, - ValueType_* __restrict__ y) { + ValueType_* __restrict__ y) +{ IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { y[i] += alpha * D[i] * x[i]; @@ -59,7 +61,7 @@ enum struct sparse_mv_alg_t : int { SPARSE_MV_UNDEFINED = -1, SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix SPARSE_MV_ALG1, // typical for CSR - SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices + SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices }; // Vector "view"-like aggregate for linear algebra purposes @@ -69,15 +71,14 @@ struct vector_view_t { value_type* buffer_; size_type size_; - vector_view_t(value_type* buffer, size_type sz) - : buffer_(buffer), size_(sz) {} + vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {} - vector_view_t(vector_view_t&& other) - : buffer_(other.raw()), size_(other.size()) {} + vector_view_t(vector_view_t&& other) : buffer_(other.raw()), size_(other.size()) {} - vector_view_t& operator=(vector_view_t&& other) { + vector_view_t& operator=(vector_view_t&& other) + { buffer_ = other.raw(); - size_ = other.size(); + size_ = other.size(); } }; @@ -85,8 +86,9 @@ template class vector_t { public: vector_t(handle_t const& raft_handle, size_type sz) - : buffer_(sz, raft_handle.get_stream()), - thrust_policy(raft_handle.get_thrust_policy()) {} + : buffer_(sz, raft_handle.get_stream()), thrust_policy(raft_handle.get_thrust_policy()) + { + } size_type size(void) const { return buffer_.size(); } @@ -94,32 +96,40 @@ class vector_t { value_type const* raw(void) const { return buffer_.data(); } - value_type nrm1() const { - return thrust::reduce(thrust_policy, buffer_.data(), - buffer_.data() + buffer_.size(), value_type{0}, + value_type nrm1() const + { + return thrust::reduce(thrust_policy, + buffer_.data(), + buffer_.data() + buffer_.size(), + value_type{0}, [] __device__(auto left, auto right) { - auto abs_left = left > 0 ? left : -left; + auto abs_left = left > 0 ? left : -left; auto abs_right = right > 0 ? right : -right; return abs_left + abs_right; }); } - void fill(value_type value) { + void fill(value_type value) + { thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); } private: - using thrust_exec_policy_t = thrust::detail::execute_with_allocator< - rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; + using thrust_exec_policy_t = + thrust::detail::execute_with_allocator, + thrust::cuda_cub::execute_on_stream_base>; rmm::device_uvector buffer_; const thrust_exec_policy_t thrust_policy; }; template struct sparse_matrix_t { - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const ncols, + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const ncols, index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), @@ -127,18 +137,25 @@ struct sparse_matrix_t { values_(values), nrows_(nrows), ncols_(ncols), - nnz_(nnz) {} + nnz_(nnz) + { + } - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), col_indices_(col_indices), values_(values), nrows_(nrows), ncols_(nrows), - nnz_(nnz) {} + nnz_(nnz) + { + } template sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) @@ -148,7 +165,9 @@ struct sparse_matrix_t { values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), ncols_(csr_view.number_of_vertices), - nnz_(csr_view.number_of_edges) {} + nnz_(csr_view.number_of_edges) + { + } virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types @@ -158,21 +177,24 @@ struct sparse_matrix_t { // descriptor creation works with non-const, and const-casting // down is dangerous) // - virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + virtual void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const { + bool transpose = false, + bool symmetric = false) const + { using namespace sparse; RAFT_EXPECTS(x != nullptr, "Null x buffer."); RAFT_EXPECTS(y != nullptr, "Null y buffer."); auto cusparse_h = handle_.get_cusparse_handle(); - auto stream = handle_.get_stream(); + auto stream = handle_.get_stream(); - cusparseOperation_t trans = - transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose - CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; // non-transpose #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP auto size_x = transpose ? nrows_ : ncols_; @@ -180,15 +202,19 @@ struct sparse_matrix_t { cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg); - //create descriptors: + // create descriptors: //(below casts are necessary, because // cusparseCreateCsr(...) takes non-const // void*; the casts should be harmless) // cusparseSpMatDescr_t matA; - CUSPARSE_CHECK(cusparsecreatecsr( - &matA, nrows_, ncols_, nnz_, const_cast(row_offsets_), - const_cast(col_indices_), const_cast(values_))); + CUSPARSE_CHECK(cusparsecreatecsr(&matA, + nrows_, + ncols_, + nnz_, + const_cast(row_offsets_), + const_cast(col_indices_), + const_cast(values_))); cusparseDnVecDescr_t vecX; CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x)); @@ -196,31 +222,29 @@ struct sparse_matrix_t { cusparseDnVecDescr_t vecY; CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y)); - //get (scratch) external device buffer size: + // get (scratch) external device buffer size: // size_t bufferSize; - CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, - vecX, &beta, vecY, spmv_alg, - &bufferSize, stream)); + CUSPARSE_CHECK(cusparsespmv_buffersize( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream)); - //allocate external buffer: + // allocate external buffer: // vector_t external_buffer(handle_, bufferSize); - //finally perform SpMV: + // finally perform SpMV: // - CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, - vecY, spmv_alg, external_buffer.raw(), stream)); + CUSPARSE_CHECK(cusparsespmv( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream)); - //free descriptors: + // free descriptors: //(TODO: maybe wrap them in a RAII struct?) // CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); #else - CUSPARSE_CHECK( - cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); if (symmetric) { @@ -229,9 +253,20 @@ struct sparse_matrix_t { CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_, - &alpha, descr, values_, row_offsets_, - col_indices_, x, &beta, y, stream)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, + trans, + nrows_, + ncols_, + nnz_, + &alpha, + descr, + values_, + row_offsets_, + col_indices_, + x, + &beta, + y, + stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); #endif } @@ -239,19 +274,18 @@ struct sparse_matrix_t { handle_t const& get_handle(void) const { return handle_; } #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP - cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const { + cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const + { switch (alg) { - case sparse_mv_alg_t::SPARSE_MV_ALG1: - return CUSPARSE_CSRMV_ALG1; - case sparse_mv_alg_t::SPARSE_MV_ALG2: - return CUSPARSE_CSRMV_ALG2; - default: - return CUSPARSE_MV_ALG_DEFAULT; + case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1; + case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2; + default: return CUSPARSE_MV_ALG_DEFAULT; } } #endif - //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, + // aggregate handle_t const& handle_; index_type const* row_offsets_; @@ -264,43 +298,51 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) - : sparse_matrix_t(raft_handle, row_offsets, - col_indices, values, nrows, nnz), - diagonal_(raft_handle, nrows) { + laplacian_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) + : sparse_matrix_t( + raft_handle, row_offsets, col_indices, values, nrows, nnz), + diagonal_(raft_handle, nrows) + { vector_t ones{raft_handle, nrows}; ones.fill(1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } laplacian_matrix_t(handle_t const& raft_handle, sparse_matrix_t const& csr_m) - : sparse_matrix_t(raft_handle, csr_m.row_offsets_, - csr_m.col_indices_, csr_m.values_, - csr_m.nrows_, csr_m.nnz_), - diagonal_(raft_handle, csr_m.nrows_) { + : sparse_matrix_t(raft_handle, + csr_m.row_offsets_, + csr_m.col_indices_, + csr_m.values_, + csr_m.nrows_, + csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) + { vector_t ones{raft_handle, csr_m.nrows_}; ones.fill(1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { constexpr int BLOCK_SIZE = 1024; - auto n = sparse_matrix_t::nrows_; + auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // scales y by beta: // @@ -312,8 +354,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply diagonal matrix // - dim3 gridDim{ - std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + dim3 gridDim{std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; dim3 blockDim{BLOCK_SIZE, 1, 1}; diagmv<<>>(n, alpha, diagonal_.raw(), x, y); @@ -321,8 +362,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply adjacency matrix // - sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, symmetric); } vector_t diagonal_; @@ -332,52 +372,66 @@ template struct modularity_matrix_t : laplacian_matrix_t { modularity_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : laplacian_matrix_t( - raft_handle, row_offsets, col_indices, values, nrows, nnz) { + raft_handle, row_offsets, col_indices, values, nrows, nnz) + { edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } modularity_matrix_t(handle_t const& raft_handle, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, csr_m) { + : laplacian_matrix_t(raft_handle, csr_m) + { edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // y = A*x // - sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, symmetric); value_type dot_res; // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - CUBLAS_CHECK(linalg::cublasdot( - cublas_h, n, laplacian_matrix_t::diagonal_.raw(), - 1, x, 1, &dot_res, stream)); + CUBLAS_CHECK(linalg::cublasdot(cublas_h, + n, + laplacian_matrix_t::diagonal_.raw(), + 1, + x, + 1, + &dot_res, + stream)); // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - CUBLAS_CHECK(linalg::cublasaxpy( - cublas_h, n, &gamma_, - laplacian_matrix_t::diagonal_.raw(), 1, y, 1, - stream)); + CUBLAS_CHECK(linalg::cublasaxpy(cublas_h, + n, + &gamma_, + laplacian_matrix_t::diagonal_.raw(), + 1, + y, + 1, + stream)); } value_type edge_sum_; diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index fededbfcb4..0e0e47ddf3 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -39,7 +39,8 @@ #endif #ifdef COLLECT_TIME_STATISTICS -static double timer(void) { +static double timer(void) +{ struct timeval tv; cudaDeviceSynchronize(); gettimeofday(&tv, NULL); @@ -78,17 +79,21 @@ using namespace linalg; * performed. * @return error flag. */ -template +template std::tuple modularity_maximization( - handle_t const &handle, sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto cublas_h = handle.get_cublas_handle(); std::tuple @@ -102,11 +107,10 @@ std::tuple modularity_maximization( modularity_matrix_t B{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = - eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, n, nEigVecs, eigVecs); @@ -117,8 +121,7 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = - cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -137,11 +140,12 @@ std::tuple modularity_maximization( * @param modularity On exit, modularity */ template -void analyzeModularity(handle_t const &handle, - sparse_matrix_t const &csr_m, +void analyzeModularity(handle_t const& handle, + sparse_matrix_t const& csr_m, vertex_t nClusters, - vertex_t const *__restrict__ clusters, - weight_t &modularity) { + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; @@ -149,15 +153,14 @@ void analyzeModularity(handle_t const &handle, weight_t partModularity, clustersize; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory vector_t part_i(handle, n); vector_t Bx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, csr_m}; @@ -167,8 +170,7 @@ void analyzeModularity(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, i, n, clustersize, partModularity, - clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 2df3812a4a..b52bfcc0d6 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -61,21 +61,25 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . */ -template -std::tuple partition( - handle_t const &handle, sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { +template +std::tuple partition(handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto cublas_h = handle.get_cublas_handle(); std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + // cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_; @@ -86,22 +90,20 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute smallest eigenvalues and eigenvectors - std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = - cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -128,16 +130,19 @@ std::tuple partition( * @return error flag. */ template -void analyzePartition(handle_t const &handle, - sparse_matrix_t const &csr_m, - vertex_t nClusters, const vertex_t *__restrict__ clusters, - weight_t &edgeCut, weight_t &cost) { +void analyzePartition(handle_t const& handle, + sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; vertex_t n = csr_m.nrows_; - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto cublas_h = handle.get_cublas_handle(); weight_t partEdgesCut, clustersize; @@ -147,22 +152,20 @@ void analyzePartition(handle_t const &handle, vector_t Lx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, csr_m}; // Initialize output - cost = 0; + cost = 0; edgeCut = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, - part_i, Lx, L)) { + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index c148350c0f..44b4af4bdc 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -27,20 +27,18 @@ namespace raft { namespace spectral { template -static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, - value_type_t* obs) { +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t i, j, k, index, mm; value_type_t alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x alpha = 0.0; - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { // check if the thread is valid valid = i < m; @@ -65,17 +63,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, // scale by alpha alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; + index = i + j * m; obs[index] = obs[index] / alpha; } } } template -index_type_t next_pow2(index_type_t n) { +index_type_t next_pow2(index_type_t n) +{ index_type_t v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float @@ -89,7 +87,8 @@ index_type_t next_pow2(index_type_t n) { } template -cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t p2m; // find next power of 2 @@ -101,17 +100,16 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel - <<>>(m, n, obs); + scale_obs_kernel<<>>(m, n, obs); return cudaSuccess; } template -void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, - weight_t* eigVecs) { - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); +void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, weight_t* eigVecs) +{ + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); const weight_t zero{0.0}; @@ -121,9 +119,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, for (auto i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, @@ -134,8 +132,7 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, thrust::minus()); CHECK_CUDA(stream); - CUBLAS_CHECK( - cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -152,16 +149,25 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, - &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, - work.raw(), nEigVecs, stream)); - - CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), - nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); } } @@ -176,9 +182,9 @@ struct equal_to_i_op { public: equal_to_i_op(index_type_t _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; } }; } // namespace @@ -186,38 +192,38 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // template -bool construct_indicator(handle_t const& handle, edge_t index, edge_t n, - weight_t& clustersize, weight_t& partStats, +bool construct_indicator(handle_t const& handle, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, vertex_t const* __restrict__ clusters, - vector_t& part_i, vector_t& Bx, - laplacian_matrix_t const& B) { - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); + vector_t& part_i, + vector_t& Bx, + laplacian_matrix_t const& B) +{ + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); - thrust::for_each(thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(index)); + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); CHECK_CUDA(stream); // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &clustersize, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); - if (clustersize < 0.5) { - return false; - } + if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK( - cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp index 406f1b7c7e..08a4e6efb5 100644 --- a/cpp/include/raft/spectral/warn_dbg.hpp +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -4,13 +4,13 @@ #include #define STRINGIFY_DETAIL(x) #x -#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) #ifdef DEBUG #define COUT() (std::cout) #define CERR() (std::cerr) -//nope: +// nope: // #define WARNING(message) \ do { \ diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 1b338a035a..e8e6bea4dd 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -27,15 +27,15 @@ namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -47,8 +47,8 @@ __global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -58,30 +58,26 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } } template -void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, - bool rowMajor, cudaStream_t stream) { +void mean( + Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor - <<>>(mu, data, D, N); + meanKernelRowMajor<<>>(mu, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); } else { - meanKernelColMajor - <<>>(mu, data, D, N); + meanKernelColMajor<<>>(mu, data, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index e8917a60b3..42351269ea 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -27,15 +27,15 @@ namespace detail { ///@todo: ColPerBlk has been tested only for 32! template -__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, - IdxType N) { +__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) { Type val = (colId < D) ? data[i * D + colId] : Type(0); thread_data += val * val; @@ -49,41 +49,39 @@ __global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, } template -__global__ void stddevKernelColMajor(Type *std, const Type *data, - const Type *mu, IdxType D, IdxType N) { +__global__ void stddevKernelColMajor( + Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - std[blockIdx.x] = raft::mySqrt(acc / N); - } + if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); } } template -__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, - IdxType D, IdxType N) { +__global__ void varsKernelColMajor( + Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - var[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } } /** @@ -105,70 +103,78 @@ __global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, * @param stream cuda stream where to launch work */ template -void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void stddev(Type* std, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(std, data, D, N); + stddevKernelRowMajor<<>>(std, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - std, std, mu, D, - [ratio] __device__(Type a, Type b) { - return raft::mySqrt(a * ratio - b * b); - }, + std, + std, + mu, + D, + [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); }, stream); } else { - stddevKernelColMajor - <<>>(std, data, mu, D, N); + stddevKernelColMajor<<>>(std, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } /** - * @brief Compute variance of the input matrix - * - * Variance operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param var the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ + * @brief Compute variance of the input matrix + * + * Variance operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @param var the output stddev vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample stddev or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ template -void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void vars(Type* var, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(var, data, D, N); + stddevKernelRowMajor<<>>(var, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - var, var, mu, D, - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); + var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); } else { - varsKernelColMajor - <<>>(var, data, mu, D, N); + varsKernelColMajor<<>>(var, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 37a3313ed1..b7f5cc8ff7 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -27,15 +27,15 @@ namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -47,8 +47,8 @@ __global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -58,27 +58,23 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } } template -void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, - cudaStream_t stream) { +void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D)); sumKernelRowMajor <<>>(output, input, D, N); } else { - sumKernelColMajor - <<>>(output, input, D, N); + sumKernelColMajor<<>>(output, input, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp index 6e4cf39850..ba1eb55e71 100644 --- a/cpp/include/raft/stats/mean.hpp +++ b/cpp/include/raft/stats/mean.hpp @@ -41,8 +41,9 @@ namespace stats { * @param stream: cuda stream */ template -void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, - bool rowMajor, cudaStream_t stream) { +void mean( + Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +{ detail::mean(mu, data, D, N, sample, rowMajor, stream); } diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp index 04934d4388..c0ba24312b 100644 --- a/cpp/include/raft/stats/mean_center.hpp +++ b/cpp/include/raft/stats/mean_center.hpp @@ -38,12 +38,25 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void meanCenter(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } /** @@ -61,11 +74,25 @@ void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, * @param stream cuda stream where to launch work */ template -void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, cudaStream_t stream) { +void meanAdd(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } }; // end namespace stats diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp index 17c5ae457d..9393dec8bc 100644 --- a/cpp/include/raft/stats/stddev.hpp +++ b/cpp/include/raft/stats/stddev.hpp @@ -42,8 +42,15 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void stddev(Type* std, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ detail::stddev(std, data, mu, D, N, sample, rowMajor, stream); } @@ -66,8 +73,15 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, * @param stream cuda stream where to launch work */ template -void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void vars(Type* var, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ detail::vars(var, data, mu, D, N, sample, rowMajor, stream); } diff --git a/cpp/include/raft/stats/sum.hpp b/cpp/include/raft/stats/sum.hpp index 4f67acdf36..cfb5142a14 100644 --- a/cpp/include/raft/stats/sum.hpp +++ b/cpp/include/raft/stats/sum.hpp @@ -38,8 +38,8 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, - cudaStream_t stream) { +void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) +{ detail::sum(output, input, D, N, rowMajor, stream); } diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index ceffbcca78..b44d8bb4ad 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -22,11 +22,11 @@ namespace raft { template -struct IOType {}; +struct IOType { +}; template <> struct IOType { - static_assert(sizeof(bool) == sizeof(int8_t), - "IOType bool size assumption failed"); + static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed"); typedef int8_t Type; }; template <> @@ -215,50 +215,50 @@ struct IOType { }; /** - * @struct TxN_t - * - * @brief Internal data structure that is used to define a facade for vectorized - * loads/stores across the most common POD types. The goal of his file is to - * provide with CUDA programmers, an easy way to have compiler issue vectorized - * load or store instructions to memory (either global or shared). Vectorized - * accesses to memory are important as they'll utilize its resources - * efficiently, - * when compared to their non-vectorized counterparts. Obviously, for whatever - * reasons if one is unable to issue such vectorized operations, one can always - * fallback to using POD types. - * - * Concept of vectorized accesses : Threads process multiple elements - * to speed up processing. These are loaded in a single read thanks - * to type promotion. It is then reinterpreted as a vector elements - * to perform the kernel's work. - * - * Caution : vectorized accesses requires input adresses to be memory aligned - * according not to the input type but to the promoted type used for reading. - * - * Example demonstrating the use of load operations, performing math on such - * loaded data and finally storing it back. - * @code{.cu} - * TxN_t mydata1, mydata2; - * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; - * mydata1.load(ptr1, idx); - * mydata2.load(ptr2, idx); - * #pragma unroll - * for(int i=0;i type. - * Only change required is to replace variable declaration appropriately. - * - * Obviously, it's caller's responsibility to take care of pointer alignment! - * - * @tparam math_ the data-type in which the compute/math needs to happen - * @tparam veclen_ the number of 'math_' types to be loaded/stored per - * instruction - */ + * @struct TxN_t + * + * @brief Internal data structure that is used to define a facade for vectorized + * loads/stores across the most common POD types. The goal of his file is to + * provide with CUDA programmers, an easy way to have compiler issue vectorized + * load or store instructions to memory (either global or shared). Vectorized + * accesses to memory are important as they'll utilize its resources + * efficiently, + * when compared to their non-vectorized counterparts. Obviously, for whatever + * reasons if one is unable to issue such vectorized operations, one can always + * fallback to using POD types. + * + * Concept of vectorized accesses : Threads process multiple elements + * to speed up processing. These are loaded in a single read thanks + * to type promotion. It is then reinterpreted as a vector elements + * to perform the kernel's work. + * + * Caution : vectorized accesses requires input adresses to be memory aligned + * according not to the input type but to the promoted type used for reading. + * + * Example demonstrating the use of load operations, performing math on such + * loaded data and finally storing it back. + * @code{.cu} + * TxN_t mydata1, mydata2; + * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; + * mydata1.load(ptr1, idx); + * mydata2.load(ptr2, idx); + * #pragma unroll + * for(int i=0;i type. + * Only change required is to replace variable declaration appropriately. + * + * Obviously, it's caller's responsibility to take care of pointer alignment! + * + * @tparam math_ the data-type in which the compute/math needs to happen + * @tparam veclen_ the number of 'math_' types to be loaded/stored per + * instruction + */ template struct TxN_t { /** underlying math data type */ @@ -282,7 +282,8 @@ struct TxN_t { * @brief Fill the contents of this structure with a constant value * @param _val the constant to be filled */ - DI void fill(math_t _val) { + DI void fill(math_t _val) + { #pragma unroll for (int i = 0; i < Ratio; ++i) { val.data[i] = _val; @@ -307,21 +308,24 @@ struct TxN_t { * @{ */ template - DI void load(const math_t *ptr, idx_t idx) { - const io_t *bptr = reinterpret_cast(&ptr[idx]); - val.internal = __ldg(bptr); + DI void load(const math_t* ptr, idx_t idx) + { + const io_t* bptr = reinterpret_cast(&ptr[idx]); + val.internal = __ldg(bptr); } template - DI void load(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); + DI void load(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); val.internal = *bptr; } template - DI void store(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); - *bptr = val.internal; + DI void store(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); + *bptr = val.internal; } /** @} */ }; @@ -338,11 +342,17 @@ struct TxN_t { DI void fill(math_t _val) {} template - DI void load(const math_t *ptr, idx_t idx) {} + DI void load(const math_t* ptr, idx_t idx) + { + } template - DI void load(math_t *ptr, idx_t idx) {} + DI void load(math_t* ptr, idx_t idx) + { + } template - DI void store(math_t *ptr, idx_t idx) {} + DI void store(math_t* ptr, idx_t idx) + { + } }; } // namespace raft diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 06b246d9a1..2c7996514a 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -23,7 +23,8 @@ namespace raft { -TEST(Raft, ClusterSolvers) { +TEST(Raft, ClusterSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -40,7 +41,7 @@ TEST(Raft, ClusterSolvers) { index_type d{10}; index_type k{5}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvecs{nullptr}; index_type* codes{nullptr}; @@ -52,7 +53,8 @@ TEST(Raft, ClusterSolvers) { EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); } -TEST(Raft, ModularitySolvers) { +TEST(Raft, ModularitySolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -66,7 +68,7 @@ TEST(Raft, ModularitySolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -80,13 +82,11 @@ TEST(Raft, ModularitySolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; EXPECT_ANY_THROW(spectral::modularity_maximization( h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index c14d880efd..150767992f 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, Utils) { +TEST(Raft, Utils) +{ ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); ASSERT_THROW(THROW("Should throw!"), exception); diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index efa1e2cd41..21d7e9d753 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -26,30 +26,42 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x, - const DataType *y, int m, int n, int k, - DataType eps, bool isRowMajor) { +__global__ void naiveDistanceAdjKernel(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc <= eps; } template -void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m, - int n, int k, DataType eps, bool isRowMajor) { +void naiveDistanceAdj(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); - naiveDistanceAdjKernel - <<>>(dist, x, y, m, n, k, eps, isRowMajor); + naiveDistanceAdjKernel<<>>(dist, x, y, m, n, k, eps, isRowMajor); CUDA_CHECK(cudaPeekAtLastError()); } @@ -62,26 +74,28 @@ struct DistanceAdjInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceAdjInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs& dims) +{ return os; } template -class DistanceAdjTest - : public ::testing::TestWithParam> { +class DistanceAdjTest : public ::testing::TestWithParam> { public: DistanceAdjTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), dist(params.m * params.n, stream), - dist_ref(params.m * params.n, stream) {} + dist_ref(params.m * params.n, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; bool isRowMajor = params.isRowMajor; rmm::device_uvector x(m * k, stream); @@ -92,21 +106,27 @@ class DistanceAdjTest DataType threshold = params.eps; - naiveDistanceAdj(dist_ref.data(), x.data(), y.data(), m, n, k, threshold, - isRowMajor); - size_t worksize = - raft::distance::getWorkspaceSize( + naiveDistanceAdj(dist_ref.data(), x.data(), y.data(), m, n, k, threshold, isRowMajor); + size_t worksize = raft::distance:: + getWorkspaceSize( x.data(), y.data(), m, n, k); rmm::device_uvector workspace(worksize, stream); auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { return d_val <= threshold; }; - raft::distance::distance( - x.data(), y.data(), dist.data(), m, n, k, workspace.data(), - workspace.size(), fin_op, stream, isRowMajor); + raft::distance::distance( + x.data(), + y.data(), + dist.data(), + m, + n, + k, + workspace.data(), + workspace.size(), + fin_op, + stream, + isRowMajor); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -131,14 +151,13 @@ const std::vector> inputsf = { {10.0f, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestF; -TEST_P(DistanceAdjTestF, Result) { +TEST_P(DistanceAdjTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); + ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.01, 1024, 1024, 32, true, 1234ULL}, @@ -151,14 +170,13 @@ const std::vector> inputsd = { {10.0, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestD; -TEST_P(DistanceAdjTestD, Result) { +TEST_P(DistanceAdjTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); + ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd)); } // namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu index bddfdff3b6..db318605b4 100644 --- a/cpp/test/distance/dist_canberra.cu +++ b/cpp/test/distance/dist_canberra.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceCanberra - : public DistanceTest {}; +class DistanceCanberra : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraF; -TEST_P(DistanceCanberraF, Result) { +TEST_P(DistanceCanberraF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraD; -TEST_P(DistanceCanberraD, Result) { +TEST_P(DistanceCanberraD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu index 0dc6edfaad..c7dccfe712 100644 --- a/cpp/test/distance/dist_chebyshev.cu +++ b/cpp/test/distance/dist_chebyshev.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceLinf - : public DistanceTest {}; +class DistanceLinf : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfF; -TEST_P(DistanceLinfF, Result) { +TEST_P(DistanceLinfF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfD; -TEST_P(DistanceLinfD, Result) { +TEST_P(DistanceLinfD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_correlation.cu b/cpp/test/distance/dist_correlation.cu index f6dc015738..0648ed96ca 100644 --- a/cpp/test/distance/dist_correlation.cu +++ b/cpp/test/distance/dist_correlation.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceCorrelation - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCorrelation DistanceCorrelationF; -TEST_P(DistanceCorrelationF, Result) { +TEST_P(DistanceCorrelationF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCorrelation DistanceCorrelationD; -TEST_P(DistanceCorrelationD, Result) { +TEST_P(DistanceCorrelationD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu index 2487bcbd95..b3e6a4c97f 100644 --- a/cpp/test/distance/dist_cos.cu +++ b/cpp/test/distance/dist_cos.cu @@ -21,9 +21,8 @@ namespace raft { namespace distance { template -class DistanceExpCos - : public DistanceTest {}; +class DistanceExpCos : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosF; -TEST_P(DistanceExpCosF, Result) { +TEST_P(DistanceExpCosF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +55,14 @@ const std::vector> inputsd = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosD; -TEST_P(DistanceExpCosD, Result) { +TEST_P(DistanceExpCosD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu index a6ef01aa45..75ff7e682a 100644 --- a/cpp/test/distance/dist_euc_exp.cu +++ b/cpp/test/distance/dist_euc_exp.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceEucExpTest - : public DistanceTest {}; +class DistanceEucExpTest : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestF; -TEST_P(DistanceEucExpTestF, Result) { +TEST_P(DistanceEucExpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestD; -TEST_P(DistanceEucExpTestD, Result) { +TEST_P(DistanceEucExpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu index 290abda352..88affa16d5 100644 --- a/cpp/test/distance/dist_euc_unexp.cu +++ b/cpp/test/distance/dist_euc_unexp.cu @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestF; -TEST_P(DistanceEucUnexpTestF, Result) { +TEST_P(DistanceEucUnexpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestD; -TEST_P(DistanceEucUnexpTestD, Result) { +TEST_P(DistanceEucUnexpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_hamming.cu b/cpp/test/distance/dist_hamming.cu index 0123c8bada..631adc751c 100644 --- a/cpp/test/distance/dist_hamming.cu +++ b/cpp/test/distance/dist_hamming.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceHamming - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHamming DistanceHammingF; -TEST_P(DistanceHammingF, Result) { +TEST_P(DistanceHammingF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHamming DistanceHammingD; -TEST_P(DistanceHammingD, Result) { +TEST_P(DistanceHammingD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu index 39d197f786..8a07c8836f 100644 --- a/cpp/test/distance/dist_hellinger.cu +++ b/cpp/test/distance/dist_hellinger.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceHellingerExp - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpF; -TEST_P(DistanceHellingerExpF, Result) { +TEST_P(DistanceHellingerExpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpD; -TEST_P(DistanceHellingerExpD, Result) { +TEST_P(DistanceHellingerExpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_jensen_shannon.cu b/cpp/test/distance/dist_jensen_shannon.cu index 9070ce92c1..3cda31a852 100644 --- a/cpp/test/distance/dist_jensen_shannon.cu +++ b/cpp/test/distance/dist_jensen_shannon.cu @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceJensenShannon DistanceJensenShannonF; -TEST_P(DistanceJensenShannonF, Result) { +TEST_P(DistanceJensenShannonF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceJensenShannon DistanceJensenShannonD; -TEST_P(DistanceJensenShannonD, Result) { +TEST_P(DistanceJensenShannonD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_kl_divergence.cu b/cpp/test/distance/dist_kl_divergence.cu index 7c32596527..4303b8cc8f 100644 --- a/cpp/test/distance/dist_kl_divergence.cu +++ b/cpp/test/distance/dist_kl_divergence.cu @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceKLDivergence DistanceKLDivergenceF; -TEST_P(DistanceKLDivergenceF, Result) { +TEST_P(DistanceKLDivergenceF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceKLDivergence DistanceKLDivergenceD; -TEST_P(DistanceKLDivergenceD, Result) { +TEST_P(DistanceKLDivergenceD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu index ff7705d195..dad160ca41 100644 --- a/cpp/test/distance/dist_l1.cu +++ b/cpp/test/distance/dist_l1.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceUnexpL1 - : public DistanceTest {}; +class DistanceUnexpL1 : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1F; -TEST_P(DistanceUnexpL1F, Result) { +TEST_P(DistanceUnexpL1F, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1D; -TEST_P(DistanceUnexpL1D, Result) { +TEST_P(DistanceUnexpL1D, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu index 7d87bbc2c7..34f6d2825e 100644 --- a/cpp/test/distance/dist_minkowski.cu +++ b/cpp/test/distance/dist_minkowski.cu @@ -21,8 +21,7 @@ namespace raft { namespace distance { template -class DistanceLpUnexp - : public DistanceTest { +class DistanceLpUnexp : public DistanceTest { }; const std::vector> inputsf = { @@ -36,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f}, }; typedef DistanceLpUnexp DistanceLpUnexpF; -TEST_P(DistanceLpUnexpF, Result) { +TEST_P(DistanceLpUnexpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL, 4.0}, @@ -56,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0}, }; typedef DistanceLpUnexp DistanceLpUnexpD; -TEST_P(DistanceLpUnexpD, Result) { +TEST_P(DistanceLpUnexpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_russell_rao.cu b/cpp/test/distance/dist_russell_rao.cu index ae735951a8..e0bfcd7eb3 100644 --- a/cpp/test/distance/dist_russell_rao.cu +++ b/cpp/test/distance/dist_russell_rao.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceRussellRao - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceRussellRao DistanceRussellRaoF; -TEST_P(DistanceRussellRaoF, Result) { +TEST_P(DistanceRussellRaoF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceRussellRao DistanceRussellRaoD; -TEST_P(DistanceRussellRaoD, Result) { +TEST_P(DistanceRussellRaoD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index f31fbc9165..f445e3b578 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -25,43 +25,52 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, int k, +__global__ void naiveDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, raft::distance::DistanceType type, - bool isRowMajor) { + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } if (type == raft::distance::DistanceType::L2SqrtExpanded || type == raft::distance::DistanceType::L2SqrtUnexpanded) acc = raft::mySqrt(acc); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveL1_Linf_CanberraDistanceKernel( - DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, - raft::distance::DistanceType type, bool isRowMajor) { +__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = (a > b) ? (a - b) : (b - a); if (type == raft::distance::DistanceType::Linf) { acc = raft::myMax(acc, diff); @@ -75,29 +84,27 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel( } } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveCosineDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } - DataType acc_a = DataType(0); - DataType acc_b = DataType(0); + DataType acc_a = DataType(0); + DataType acc_b = DataType(0); DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_a += a * a; acc_b += b * b; acc_ab += a * b; @@ -106,64 +113,67 @@ __global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Use 1.0 - (cosine similarity) to calc the distance - dist[outidx] = - (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); + dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); } template -__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveHellingerDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_ab += raft::mySqrt(a) * raft::mySqrt(b); } int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - acc_ab = 1 - acc_ab; + acc_ab = 1 - acc_ab; auto rectifier = (!signbit(acc_ab)); - dist[outidx] = raft::mySqrt(rectifier * acc_ab); + dist[outidx] = raft::mySqrt(rectifier * acc_ab); } template -__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor, DataType p) { +__global__ void naiveLpUnexpDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + bool isRowMajor, + DataType p) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = raft::L1Op()(a - b); acc += raft::myPow(diff, p); } auto one_over_p = 1 / p; - acc = raft::myPow(acc, one_over_p); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; - dist[outidx] = acc; + acc = raft::myPow(acc, one_over_p); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; } template -__global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveHammingDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; @@ -171,21 +181,19 @@ __global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x, for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc += (a != b); } - acc = acc / k; - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = acc / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveJensenShannonDistanceKernel(DataType *dist, - const DataType *x, - const DataType *y, int m, - int n, int k, - bool isRowMajor) { +__global__ void naiveJensenShannonDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; @@ -193,10 +201,10 @@ __global__ void naiveJensenShannonDistanceKernel(DataType *dist, for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; - DataType m = 0.5f * (a + b); + DataType m = 0.5f * (a + b); bool a_zero = a == 0; bool b_zero = b == 0; @@ -206,18 +214,17 @@ __global__ void naiveJensenShannonDistanceKernel(DataType *dist, bool p_zero = p == 0; bool q_zero = q == 0; - acc += - (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero))); + acc += (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero))); } - acc = raft::mySqrt(0.5f * acc); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = raft::mySqrt(0.5f * acc); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveRussellRaoDistanceKernel( + OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; @@ -225,56 +232,55 @@ __global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x, for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc += (a * b); } - acc = (k - acc) / k; - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = (k - acc) / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveKLDivergenceDistanceKernel(OutType *dist, - const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveKLDivergenceDistanceKernel( + OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; OutType acc = OutType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; - bool b_zero = (b == 0); - const auto m = (!b_zero) * (a / b); + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + bool b_zero = (b == 0); + const auto m = (!b_zero) * (a / b); const bool m_zero = (m == 0); acc += (a * (!m_zero) * log(m + m_zero)); } - acc = 0.5f * acc; - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = 0.5f * acc; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveCorrelationDistanceKernel( + OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; - OutType acc = OutType(0); - auto a_norm = DataType(0); - auto b_norm = DataType(0); + OutType acc = OutType(0); + auto a_norm = DataType(0); + auto b_norm = DataType(0); auto a_sq_norm = DataType(0); auto b_sq_norm = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; a_norm += a; b_norm += b; a_sq_norm += (a * a); @@ -282,20 +288,27 @@ __global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x, acc += (a * b); } - auto numer = k * acc - (a_norm * b_norm); + auto numer = k * acc - (a_norm * b_norm); auto Q_denom = k * a_sq_norm - (a_norm * a_norm); auto R_denom = k * b_sq_norm - (b_norm * b_norm); acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, - int n, int k, raft::distance::DistanceType type, - bool isRowMajor, DataType metric_arg = 2.0f) { +void naiveDistance(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); @@ -310,43 +323,34 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, case raft::distance::DistanceType::L2Unexpanded: case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: - naiveDistanceKernel - <<>>(dist, x, y, m, n, k, type, isRowMajor); + naiveDistanceKernel<<>>(dist, x, y, m, n, k, type, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - naiveCosineDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveCosineDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - naiveHellingerDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveHellingerDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: naiveLpUnexpDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); break; case raft::distance::DistanceType::HammingUnexpanded: - naiveHammingDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveHammingDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::JensenShannon: - naiveJensenShannonDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveJensenShannonDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::RusselRaoExpanded: - naiveRussellRaoDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveRussellRaoDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::KLDivergence: - naiveKLDivergenceDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveKLDivergenceDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::CorrelationExpanded: - naiveCorrelationDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveCorrelationDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; - default: - FAIL() << "should be here\n"; + default: FAIL() << "should be here\n"; } CUDA_CHECK(cudaPeekAtLastError()); } @@ -361,24 +365,33 @@ struct DistanceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceInputs& dims) +{ return os; } template -void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2, - int m, int n, int k, DistanceInputs ¶ms, - DataType threshold, char *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor, - DataType metric_arg = 2.0f) { +void distanceLauncher(DataType* x, + DataType* y, + DataType* dist, + DataType* dist2, + int m, + int n, + int k, + DistanceInputs& params, + DataType threshold, + char* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) { dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val; return d_val; }; raft::distance::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); } template @@ -391,23 +404,25 @@ class DistanceTest : public ::testing::TestWithParam> { y(params.n * params.k, stream), dist_ref(params.m * params.n, stream), dist(params.m * params.n, stream), - dist2(params.m * params.n, stream) {} + dist2(params.m * params.n, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; DataType metric_arg = params.metric_arg; - bool isRowMajor = params.isRowMajor; + bool isRowMajor = params.isRowMajor; if (distanceType == raft::distance::DistanceType::HellingerExpanded || distanceType == raft::distance::DistanceType::JensenShannon || distanceType == raft::distance::DistanceType::KLDivergence) { // Hellinger works only on positive numbers r.uniform(x.data(), m * k, DataType(0.0), DataType(1.0), stream); r.uniform(y.data(), n * k, DataType(0.0), DataType(1.0), stream); - } else if (distanceType == - raft::distance::DistanceType::RusselRaoExpanded) { + } else if (distanceType == raft::distance::DistanceType::RusselRaoExpanded) { r.uniform(x.data(), m * k, DataType(0.0), DataType(1.0), stream); r.uniform(y.data(), n * k, DataType(0.0), DataType(1.0), stream); // Russel rao works on boolean values. @@ -417,17 +432,27 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(x.data(), m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y.data(), n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref.data(), x.data(), y.data(), m, n, k, distanceType, - isRowMajor, metric_arg); - size_t worksize = - raft::distance::getWorkspaceSize(x.data(), y.data(), m, n, k); + naiveDistance( + dist_ref.data(), x.data(), y.data(), m, n, k, distanceType, isRowMajor, metric_arg); + size_t worksize = raft::distance::getWorkspaceSize( + x.data(), y.data(), m, n, k); rmm::device_uvector workspace(worksize, stream); DataType threshold = -10000.f; - distanceLauncher( - x.data(), y.data(), dist.data(), dist2.data(), m, n, k, params, threshold, - workspace.data(), workspace.size(), stream, isRowMajor, metric_arg); + distanceLauncher(x.data(), + y.data(), + dist.data(), + dist2.data(), + m, + n, + k, + params, + threshold, + workspace.data(), + workspace.size(), + stream, + isRowMajor, + metric_arg); CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 33782baf8d..932857c536 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -30,40 +30,40 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template -__global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, - DataT *y, int m, int n, int k, int *workspace, - DataT maxVal) { - int midx = threadIdx.y + blockIdx.y * blockDim.y; - int nidx = threadIdx.x + blockIdx.x * blockDim.x; +__global__ void naiveKernel(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + DataT maxVal) +{ + int midx = threadIdx.y + blockIdx.y * blockDim.y; + int nidx = threadIdx.x + blockIdx.x * blockDim.x; DataT acc = DataT(0); for (int i = 0; i < k; ++i) { - int xidx = i + midx * k; - int yidx = i + nidx * k; + int xidx = i + midx * k; + int yidx = i + nidx * k; auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx]; acc += diff * diff; } - if (Sqrt) { - acc = raft::mySqrt(acc); - } + if (Sqrt) { acc = raft::mySqrt(acc); } ReduceOpT redOp; typedef cub::WarpReduce> WarpReduce; __shared__ typename WarpReduce::TempStorage temp[NWARPS]; int warpId = threadIdx.x / raft::WarpSize; cub::KeyValuePair tmp; - tmp.key = nidx; + tmp.key = nidx; tmp.value = midx >= m || nidx >= n ? maxVal : acc; - tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); + tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); if (threadIdx.x % raft::WarpSize == 0 && midx < m) { while (atomicCAS(workspace + midx, 0, 1) == 1) ; @@ -75,8 +75,15 @@ __global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, } template -void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, - int k, int *workspace, cudaStream_t stream) { +void naive(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + cudaStream_t stream) +{ static const dim3 TPB(32, 16, 1); dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1); CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); @@ -86,8 +93,7 @@ void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, <<>>(min, m, std::numeric_limits::max(), op); CUDA_CHECK(cudaGetLastError()); naiveKernel, 16> - <<>>(min, x, y, m, n, k, workspace, - std::numeric_limits::max()); + <<>>(min, x, y, m, n, k, workspace, std::numeric_limits::max()); CUDA_CHECK(cudaGetLastError()); } @@ -110,10 +116,13 @@ class FusedL2NNTest : public ::testing::TestWithParam> { yn(params.n, stream), min(params.m, stream), min_ref(params.m, stream), - workspace(params.m * sizeof(int), stream) {} + workspace(params.m * sizeof(int), stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int m = params.m; int n = params.n; @@ -121,10 +130,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { r.uniform(x.data(), m * k, DataT(-1.0), DataT(1.0), stream); r.uniform(y.data(), n * k, DataT(-1.0), DataT(1.0), stream); generateGoldenResult(); - raft::linalg::rowNorm(xn.data(), x.data(), k, m, raft::linalg::L2Norm, true, - stream); - raft::linalg::rowNorm(yn.data(), y.data(), k, n, raft::linalg::L2Norm, true, - stream); + raft::linalg::rowNorm(xn.data(), x.data(), k, m, raft::linalg::L2Norm, true, stream); + raft::linalg::rowNorm(yn.data(), y.data(), k, n, raft::linalg::L2Norm, true, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -140,23 +147,34 @@ class FusedL2NNTest : public ::testing::TestWithParam> { raft::handle_t handle; cudaStream_t stream; - virtual void generateGoldenResult() { + virtual void generateGoldenResult() + { int m = params.m; int n = params.n; int k = params.k; - naive(min_ref.data(), x.data(), y.data(), m, n, k, - (int *)workspace.data(), stream); + naive(min_ref.data(), x.data(), y.data(), m, n, k, (int*)workspace.data(), stream); } - void runTest(cub::KeyValuePair *out) { + void runTest(cub::KeyValuePair* out) + { int m = params.m; int n = params.n; int k = params.k; MinAndDistanceReduceOp redOp; - fusedL2NN, int>( - out, x.data(), y.data(), xn.data(), yn.data(), m, n, k, - (void *)workspace.data(), redOp, - raft::distance::KVPMinReduce(), Sqrt, true, stream); + fusedL2NN, int>(out, + x.data(), + y.data(), + xn.data(), + yn.data(), + m, + n, + k, + (void*)workspace.data(), + redOp, + raft::distance::KVPMinReduce(), + Sqrt, + true, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } }; @@ -165,9 +183,10 @@ template struct CompareApproxAbsKVP { typedef typename cub::KeyValuePair KVP; CompareApproxAbsKVP(T eps_) : eps(eps_) {} - bool operator()(const KVP &a, const KVP &b) const { - T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); - T m = std::max(raft::abs(a.value), raft::abs(b.value)); + bool operator()(const KVP& a, const KVP& b) const + { + T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); + T m = std::max(raft::abs(a.value), raft::abs(b.value)); T ratio = m >= eps ? diff / m : diff; return (ratio <= eps); } @@ -179,17 +198,20 @@ struct CompareApproxAbsKVP { template struct CompareExactKVP { typedef typename cub::KeyValuePair KVP; - bool operator()(const KVP &a, const KVP &b) const { + bool operator()(const KVP& a, const KVP& b) const + { if (a.value != b.value) return false; return true; } }; template -::testing::AssertionResult devArrMatch(const cub::KeyValuePair *expected, - const cub::KeyValuePair *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +::testing::AssertionResult devArrMatch(const cub::KeyValuePair* expected, + const cub::KeyValuePair* actual, + size_t size, + L eq_compare, + cudaStream_t stream = 0) +{ typedef typename cub::KeyValuePair KVP; std::shared_ptr exp_h(new KVP[size]); std::shared_ptr act_h(new KVP[size]); @@ -201,47 +223,44 @@ template auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { return ::testing::AssertionFailure() - << "actual=" << act.key << "," << act.value - << " != expected=" << exp.key << "," << exp.value << " @" << i; + << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << "," + << exp.value << " @" << i; } } return ::testing::AssertionSuccess(); } const std::vector> inputsf = { - {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, - {0.001f, 64, 32, 32, 1234ULL}, {0.001f, 64, 64, 32, 1234ULL}, - {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, + {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, {0.001f, 64, 32, 32, 1234ULL}, + {0.001f, 64, 64, 32, 1234ULL}, {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL}, - {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, - {0.001f, 64, 32, 34, 1234ULL}, {0.001f, 64, 64, 34, 1234ULL}, - {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, + {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, {0.001f, 64, 32, 34, 1234ULL}, + {0.001f, 64, 64, 34, 1234ULL}, {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL}, - {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, - {0.001f, 64, 32, 33, 1234ULL}, {0.001f, 64, 64, 33, 1234ULL}, - {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, + {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, {0.001f, 64, 32, 33, 1234ULL}, + {0.001f, 64, 64, 33, 1234ULL}, {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL}, {0.006f, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestF_Sq; -TEST_P(FusedL2NNTestF_Sq, Result) { +TEST_P(FusedL2NNTestF_Sq, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNTest FusedL2NNTestF_Sqrt; -TEST_P(FusedL2NNTestF_Sqrt, Result) { +TEST_P(FusedL2NNTestF_Sqrt, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.00001, 32, 32, 32, 1234ULL}, {0.00001, 32, 64, 32, 1234ULL}, @@ -262,21 +281,21 @@ const std::vector> inputsd = { {0.00001, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestD_Sq; -TEST_P(FusedL2NNTestD_Sq, Result) { +TEST_P(FusedL2NNTestD_Sq, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNTest FusedL2NNTestD_Sqrt; -TEST_P(FusedL2NNTestD_Sqrt, Result) { +TEST_P(FusedL2NNTestD_Sqrt, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd)); /// This is to test output determinism of the prim template @@ -284,7 +303,8 @@ class FusedL2NNDetTest : public FusedL2NNTest { public: FusedL2NNDetTest() : stream(handle.get_stream()), min1(0, stream) {} - void SetUp() override { + void SetUp() override + { FusedL2NNTest::SetUp(); int m = this->params.m; min1.resize(m, stream); @@ -305,50 +325,46 @@ class FusedL2NNDetTest : public FusedL2NNTest { }; typedef FusedL2NNDetTest FusedL2NNDetTestF_Sq; -TEST_P(FusedL2NNDetTestF_Sq, Result) { +TEST_P(FusedL2NNDetTestF_Sq, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE( - devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestF_Sqrt; -TEST_P(FusedL2NNDetTestF_Sqrt, Result) { +TEST_P(FusedL2NNDetTestF_Sqrt, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE( - devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sq; -TEST_P(FusedL2NNDetTestD_Sq, Result) { +TEST_P(FusedL2NNDetTestD_Sq, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, - CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sqrt; -TEST_P(FusedL2NNDetTestD_Sqrt, Result) { +TEST_P(FusedL2NNDetTestD_Sqrt, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, - CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index ede790b38c..dc7de92eb8 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -25,7 +25,8 @@ namespace raft { -TEST(Raft, EigenSolvers) { +TEST(Raft, EigenSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -36,7 +37,7 @@ TEST(Raft, EigenSolvers) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; @@ -48,7 +49,7 @@ TEST(Raft, EigenSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; @@ -59,14 +60,13 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; - EXPECT_ANY_THROW( - eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); - EXPECT_ANY_THROW( - eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); } -TEST(Raft, SpectralSolvers) { +TEST(Raft, SpectralSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -80,7 +80,7 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -94,19 +94,16 @@ TEST(Raft, SpectralSolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver, - clusters, eigvals, eigvecs)); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + EXPECT_ANY_THROW( + spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; - EXPECT_ANY_THROW( - spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); + EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 3e27789078..698a601e85 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -22,7 +22,8 @@ namespace raft { -TEST(Raft, HandleDefault) { +TEST(Raft, HandleDefault) +{ handle_t h; ASSERT_EQ(0, h.get_device()); ASSERT_EQ(nullptr, h.get_stream()); @@ -32,7 +33,8 @@ TEST(Raft, HandleDefault) { ASSERT_NE(nullptr, h.get_cusparse_handle()); } -TEST(Raft, Handle) { +TEST(Raft, Handle) +{ handle_t h(4); ASSERT_EQ(4, h.get_num_internal_streams()); cudaStream_t stream; @@ -43,13 +45,15 @@ TEST(Raft, Handle) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, GetInternalStreams) { +TEST(Raft, GetInternalStreams) +{ handle_t h(4); auto streams = h.get_internal_streams(); ASSERT_EQ(4U, streams.size()); } -TEST(Raft, GetHandleFromPool) { +TEST(Raft, GetHandleFromPool) +{ handle_t parent(4); handle_t child(parent, 2); @@ -62,13 +66,13 @@ TEST(Raft, GetHandleFromPool) { ASSERT_EQ(parent.get_device(), child.get_device()); } -TEST(Raft, GetHandleStreamViews) { +TEST(Raft, GetHandleStreamViews) +{ handle_t parent(4); handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view()); - ASSERT_EQ(parent.get_internal_stream_view(2).value(), - child.get_stream_view().value()); + ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value()); EXPECT_FALSE(child.get_stream_view().is_default()); } } // namespace raft diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp index 830d085a40..d883de59fe 100644 --- a/cpp/test/integer_utils.cpp +++ b/cpp/test/integer_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, rounding_up) { +TEST(Raft, rounding_up) +{ ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2); ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0); ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1); @@ -29,7 +30,8 @@ TEST(Raft, rounding_up) { ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1); } -TEST(Raft, is_a_power_of_two) { +TEST(Raft, is_a_power_of_two) +{ ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true); ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false); } diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index f79d8f10c8..d983ec1162 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -35,7 +35,8 @@ class labelTest : public ::testing::Test { }; typedef labelTest MakeMonotonicTest; -TEST_F(MakeMonotonicTest, Result) { +TEST_F(MakeMonotonicTest, Result) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -45,11 +46,9 @@ TEST_F(MakeMonotonicTest, Result) { rmm::device_uvector actual(m, stream); rmm::device_uvector expected(m, stream); - float *data_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; + float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; - float *expected_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; + float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; raft::update_device(data.data(), data_h, m, stream); raft::update_device(expected.data(), expected_h, m, stream); @@ -58,14 +57,14 @@ TEST_F(MakeMonotonicTest, Result) { CUDA_CHECK(cudaStreamSynchronize(stream)); - ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, - raft::Compare(), stream)); + ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, raft::Compare(), stream)); delete data_h; delete expected_h; } -TEST(labelTest, Classlabels) { +TEST(labelTest, Classlabels) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -81,17 +80,16 @@ TEST(labelTest, Classlabels) { ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, - raft::Compare(), stream)); + EXPECT_TRUE( + devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare(), stream)); rmm::device_uvector y_relabeled_d(n_rows, stream); - getOvrlabels(y_d.data(), n_rows, y_unique_d.data(), n_classes, - y_relabeled_d.data(), 2, stream); + getOvrlabels(y_d.data(), n_rows, y_unique_d.data(), n_classes, y_relabeled_d.data(), 2, stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; - EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d.data(), n_rows, - raft::Compare(), stream)); + EXPECT_TRUE( + devArrMatchHost(y_relabeled_exp, y_relabeled_d.data(), n_rows, raft::Compare(), stream)); } }; // namespace label }; // namespace raft diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index 76e0a4295e..dd67f0fd89 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -39,8 +39,7 @@ struct MergeLabelsInputs { }; template -class MergeLabelsTest - : public ::testing::TestWithParam> { +class MergeLabelsTest : public ::testing::TestWithParam> { protected: MergeLabelsTest() : params(::testing::TestWithParam>::GetParam()), @@ -50,25 +49,23 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(stream) {} - - void Run() { - raft::update_device(labels_a.data(), params.labels_a.data(), params.N, - stream); - raft::update_device(labels_b.data(), params.labels_b.data(), params.N, - stream); - raft::update_device(expected.data(), params.expected.data(), params.N, - stream); - raft::update_device(mask.data(), - reinterpret_cast(params.mask.data()), params.N, - stream); - - merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(), - m.data(), params.N, stream); + m(stream) + { + } + + void Run() + { + raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream); + raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream); + raft::update_device(expected.data(), params.expected.data(), params.N, stream); + raft::update_device(mask.data(), reinterpret_cast(params.mask.data()), params.N, stream); + + merge_labels( + labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream); cudaStreamSynchronize(stream); - ASSERT_TRUE(raft::devArrMatch(expected.data(), labels_a.data(), - params.N, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + expected.data(), labels_a.data(), params.N, raft::Compare())); } protected: @@ -86,22 +83,14 @@ TEST_P(MergeLabelsTestI, Result) { Run(); } using MergeLabelsTestL = MergeLabelsTest; TEST_P(MergeLabelsTestL, Result) { Run(); } -constexpr int MAX32 = std::numeric_limits::max(); +constexpr int MAX32 = std::numeric_limits::max(); constexpr int64_t MAX64 = std::numeric_limits::max(); const std::vector> merge_inputs_32 = { {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX32}, - {1, 2, MAX32, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX32, 1, 3, MAX32}, {1, 2, 3, 2, MAX32, 2, 2, 2}, @@ -117,16 +106,8 @@ const std::vector> merge_inputs_32 = { const std::vector> merge_inputs_64 = { {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX64}, - {1, 2, MAX64, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX64, 1, 3, MAX64}, {1, 2, 3, 2, MAX64, 2, 2, 2}, @@ -139,10 +120,8 @@ const std::vector> merge_inputs_64 = { {1, 1, 1, 1, 1, 7, 7, 7}}, }; -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, - ::testing::ValuesIn(merge_inputs_32)); -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, - ::testing::ValuesIn(merge_inputs_64)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64)); } // namespace label } // namespace raft diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 08429e18f2..183c0bd2f3 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -31,11 +31,11 @@ #include #include -#define PROBLEMSIZE 1000 // Number of rows/columns -#define BATCHSIZE 10 // Number of problems in the batch -#define COSTRANGE 1000 +#define PROBLEMSIZE 1000 // Number of rows/columns +#define BATCHSIZE 10 // Number of problems in the batch +#define COSTRANGE 1000 #define PROBLEMCOUNT 1 -#define REPETITIONS 1 +#define REPETITIONS 1 #define SEED 01010001 @@ -45,38 +45,41 @@ namespace raft { // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE]. template -void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) { +void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange) +{ long N2 = SP * N * N; std::uniform_int_distribution distribution(0, costrange); for (long i = 0; i < N2; i++) { - int val = distribution(generator); + int val = distribution(generator); cost_matrix[i] = (weight_t)val; } } template -void hungarian_test(int problemsize, int costrange, int problemcount, - int repetitions, int batchsize, weight_t epsilon, - bool verbose = false) { +void hungarian_test(int problemsize, + int costrange, + int problemcount, + int repetitions, + int batchsize, + weight_t epsilon, + bool verbose = false) +{ raft::handle_t handle; - weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize]; + weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize]; for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); - rmm::device_uvector elements_v( - batchsize * problemsize * problemsize, handle.get_stream()); - rmm::device_uvector row_assignment_v(batchsize * problemsize, - handle.get_stream()); - rmm::device_uvector col_assignment_v(batchsize * problemsize, - handle.get_stream()); + rmm::device_uvector elements_v(batchsize * problemsize * problemsize, + handle.get_stream()); + rmm::device_uvector row_assignment_v(batchsize * problemsize, handle.get_stream()); + rmm::device_uvector col_assignment_v(batchsize * problemsize, handle.get_stream()); - raft::update_device(elements_v.data(), h_cost, - batchsize * problemsize * problemsize, - handle.get_stream()); + raft::update_device( + elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream()); for (int i = 0; i < repetitions; i++) { float start = omp_get_wtime(); @@ -86,20 +89,18 @@ void hungarian_test(int problemsize, int costrange, int problemcount, handle, problemsize, batchsize, epsilon); // Solve LAP(s) for given cost matrix - lpx.solve(elements_v.data(), row_assignment_v.data(), - col_assignment_v.data()); + lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data()); float end = omp_get_wtime(); float total_time = (end - start); if (verbose) { - // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match. + // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual + // objectives. At optimality both values should match. for (int k = 0; k < batchsize; k++) { - std::cout << j << ":" << i << ":" << k << ":" - << lpx.getPrimalObjectiveValue(k) << ":" - << lpx.getDualObjectiveValue(k) << ":" << total_time - << std::endl; + std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":" + << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl; } } } @@ -108,34 +109,38 @@ void hungarian_test(int problemsize, int costrange, int problemcount, delete[] h_cost; } -TEST(Raft, HungarianIntFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianIntFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianIntDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianIntDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianIntLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianIntLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } -TEST(Raft, HungarianLongFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianLongFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianLongDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, - REPETITIONS, BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianLongDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianLongLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianLongLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } } // namespace raft diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 48ad83dfd2..17b000044e 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -33,10 +33,13 @@ class AddTest : public ::testing::TestWithParam> { in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -47,9 +50,10 @@ class AddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void compare() { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); + void compare() + { + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } protected: diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 137419758f..1d9352bfc1 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -23,18 +23,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2, - int len) { +__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = OutT(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); } } template -void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) { +void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -47,8 +46,8 @@ struct AddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const AddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const AddInputs& dims) +{ return os; } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index c8121bfbe4..c833faa0b2 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -29,28 +29,29 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void binaryOpLaunch( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -class BinaryOpTest - : public ::testing::TestWithParam> { +class BinaryOpTest : public ::testing::TestWithParam> { public: BinaryOpTest() - : params(::testing::TestWithParam< - BinaryOpInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); IdxType len = params.len; r.uniform(in1.data(), len, InType(-1.0), InType(1.0), stream); @@ -71,67 +72,66 @@ class BinaryOpTest rmm::device_uvector out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32; -TEST_P(BinaryOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i64; -TEST_P(BinaryOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32_D; -TEST_P(BinaryOpTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32_D, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i32; -TEST_P(BinaryOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i64; -TEST_P(BinaryOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); template class BinaryOpAlignment : public ::testing::Test { protected: - BinaryOpAlignment() { + BinaryOpAlignment() + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void Misaligned() { + void Misaligned() + { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; @@ -141,8 +141,12 @@ class BinaryOpAlignment : public ::testing::Test { CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( - z.data() + 9, x.data() + 137, y.data() + 19, 256, - [] __device__(math_t x, math_t y) { return x + y; }, stream); + z.data() + 9, + x.data() + 137, + y.data() + 19, + 256, + [] __device__(math_t x, math_t y) { return x + y; }, + stream); } raft::handle_t handle; diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index fd8ed6dd1e..97cb3ecb24 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -24,18 +24,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len) { +__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); - if (idx < len) { - out[idx] = static_cast(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = static_cast(in1[idx] + in2[idx]); } } template -void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { +void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ static const IdxType TPB = 64; - IdxType nblks = raft::ceildiv(len, TPB); + IdxType nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -48,8 +47,8 @@ struct BinaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const BinaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) +{ return os; } diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 262a1ad26c..6c7bbd1232 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -36,7 +36,8 @@ class CholeskyR1Test : public ::testing::Test { L(n_rows * n_rows, handle.get_stream()), L_exp(n_rows * n_rows, handle.get_stream()), devInfo(handle.get_stream()), - workspace(0, handle.get_stream()) { + workspace(0, handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -48,55 +49,58 @@ class CholeskyR1Test : public ::testing::Test { int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace // requirements. - raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr, - &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes); workspace.resize(Lwork, stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testR1Update() { + void testR1Update() + { int n = n_rows * n_rows; - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), n, stream); for (int rank = 1; rank <= n_rows; rank++) { std::stringstream ss; - ss << "Rank " << rank - << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); + ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); SCOPED_TRACE(ss.str()); // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf( - solver_handle, uplo, rank, L_exp.data(), n_rows, - (math_t*)workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + stream)); // Incremental Cholesky factorization using rank one updates. - raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows, - workspace.data(), &Lwork, uplo, - stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream); - ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank, - raft::CompareApprox(3e-3))); + ASSERT_TRUE(raft::devArrMatch( + L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox(3e-3))); } } } - void testR1Error() { + void testR1Error() + { raft::update_device(G.data(), G2_host, 4, stream); - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), 4, stream); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream)); - ASSERT_THROW( - raft::linalg::choleskyRank1Update( - handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), - raft::exception); + ASSERT_THROW(raft::linalg::choleskyRank1Update( + handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), + raft::exception); math_t eps = std::numeric_limits::epsilon(); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index fdfc3052b7..9bb84e1eb7 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -33,8 +33,8 @@ struct coalescedReductionInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const coalescedReductionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs& dims) +{ return os; } @@ -42,25 +42,28 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream, bool inplace = false) { - coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace, - [] __device__(T in, int i) { return in * in; }); +void coalescedReductionLaunch( + T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false) +{ + coalescedReduction( + dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; }); } template -class coalescedReductionTest - : public ::testing::TestWithParam> { +class coalescedReductionTest : public ::testing::TestWithParam> { public: coalescedReductionTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.rows * params.cols, stream), - dots_act(params.rows * params.cols, stream) {} + dots_act(params.rows * params.cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; @@ -70,8 +73,7 @@ class coalescedReductionTest // Perform reduction with default inplace = false first coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream); // Add to result with inplace = true next - coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream, - true); + coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream, true); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -86,34 +88,36 @@ class coalescedReductionTest rmm::device_uvector dots_act; }; -const std::vector> inputsf = { - {0.000002f, 1024, 32, 1234ULL}, - {0.000002f, 1024, 64, 1234ULL}, - {0.000002f, 1024, 128, 1234ULL}, - {0.000002f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.000002f, 1024, 32, 1234ULL}, + {0.000002f, 1024, 64, 1234ULL}, + {0.000002f, 1024, 128, 1234ULL}, + {0.000002f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef coalescedReductionTest coalescedReductionTestF; -TEST_P(coalescedReductionTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } typedef coalescedReductionTest coalescedReductionTestD; -TEST_P(coalescedReductionTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index d90955147c..130a22abf0 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -25,37 +25,36 @@ namespace raft { namespace linalg { template -__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] / scalar; - } + if (idx < len) { out[idx] = in[idx] / scalar; } } template -void naiveDivide(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveDivideKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } template -class DivideTest - : public ::testing::TestWithParam> { +class DivideTest : public ::testing::TestWithParam> { public: DivideTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -75,25 +74,23 @@ class DivideTest rmm::device_uvector out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef DivideTest DivideTestF; -TEST_P(DivideTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(DivideTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf)); typedef DivideTest DivideTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(DivideTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(DivideTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 2ac9118506..3df3abd2af 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -35,7 +35,8 @@ struct EigInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigInputs& dims) +{ return os; } @@ -56,34 +57,60 @@ class EigTest : public ::testing::TestWithParam> { eig_vectors_large(params.n * params.n, stream), eig_vectors_jacobi_large(params.n * params.n, stream), eig_vals_large(params.n, stream), - eig_vals_jacobi_large(params.n, stream) {} + eig_vals_jacobi_large(params.n, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix.data(), cov_matrix_h, len, stream); - T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, - 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, - 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; + T eig_vectors_ref_h[] = {0.2790, + -0.6498, + 0.6498, + -0.2789, + -0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, len, stream); - raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, params.n_col, - stream); + raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, params.n_col, stream); - eigDC(handle, cov_matrix.data(), params.n_row, params.n_col, - eig_vectors.data(), eig_vals.data(), stream); + eigDC(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + eig_vectors.data(), + eig_vals.data(), + stream); - T tol = 1.e-7; + T tol = 1.e-7; int sweeps = 15; - eigJacobi(handle, cov_matrix.data(), params.n_row, params.n_col, - eig_vectors_jacobi.data(), eig_vals_jacobi.data(), stream, tol, + eigJacobi(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + eig_vectors_jacobi.data(), + eig_vals_jacobi.data(), + stream, + tol, sweeps); // test code for comparing two methods @@ -91,11 +118,22 @@ class EigTest : public ::testing::TestWithParam> { r.uniform(cov_matrix_large.data(), len, T(-1.0), T(1.0), stream); - eigDC(handle, cov_matrix_large.data(), params.n, params.n, - eig_vectors_large.data(), eig_vals_large.data(), stream); - eigJacobi(handle, cov_matrix_large.data(), params.n, params.n, - eig_vectors_jacobi_large.data(), eig_vals_jacobi_large.data(), - stream, tol, sweeps); + eigDC(handle, + cov_matrix_large.data(), + params.n, + params.n, + eig_vectors_large.data(), + eig_vals_large.data(), + stream); + eigJacobi(handle, + cov_matrix_large.data(), + params.n, + params.n, + eig_vectors_jacobi_large.data(), + eig_vals_jacobi_large.data(), + stream, + tol, + sweeps); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -105,87 +143,105 @@ class EigTest : public ::testing::TestWithParam> { EigInputs params; - rmm::device_uvector cov_matrix, eig_vectors, eig_vectors_jacobi, - eig_vectors_ref, eig_vals, eig_vals_jacobi, eig_vals_ref; + rmm::device_uvector cov_matrix, eig_vectors, eig_vectors_jacobi, eig_vectors_ref, eig_vals, + eig_vals_jacobi, eig_vals_ref; - rmm::device_uvector cov_matrix_large, eig_vectors_large, - eig_vectors_jacobi_large, eig_vals_large, eig_vals_jacobi_large; + rmm::device_uvector cov_matrix_large, eig_vectors_large, eig_vectors_jacobi_large, + eig_vals_large, eig_vals_jacobi_large; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigTest EigTestValF; -TEST_P(EigTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValD; -TEST_P(EigTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecF; -TEST_P(EigTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecD; -TEST_P(EigTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiF; -TEST_P(EigTestValJacobiF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals_jacobi.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals_jacobi.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiD; -TEST_P(EigTestValJacobiD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals_jacobi.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals_jacobi.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiF; -TEST_P(EigTestVecJacobiF, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref.data(), eig_vectors_jacobi.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors_jacobi.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiD; -TEST_P(EigTestVecJacobiD, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref.data(), eig_vectors_jacobi.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors_jacobi.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareF; -TEST_P(EigTestVecCompareF, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large.data(), eig_vectors_jacobi_large.data(), - (params.n * params.n), raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large.data(), + eig_vectors_jacobi_large.data(), + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareD; -TEST_P(EigTestVecCompareD, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large.data(), eig_vectors_jacobi_large.data(), - (params.n * params.n), raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large.data(), + eig_vectors_jacobi_large.data(), + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2)); @@ -196,17 +252,13 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index 9eb1c10313..b1e88c91dd 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -37,7 +37,8 @@ struct EigSelInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigSelInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigSelInputs& dims) +{ return os; } @@ -51,27 +52,46 @@ class EigSelTest : public ::testing::TestWithParam> { eig_vectors(12, stream), eig_vectors_ref(12, stream), eig_vals(params.n_col, stream), - eig_vals_ref(params.n_col, stream) {} + eig_vals_ref(params.n_col, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { int len = params.len; - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix.data(), cov_matrix_h, len, stream); - T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, - -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; + T eig_vectors_ref_h[] = {-0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream); - eigSelDC(handle, cov_matrix.data(), params.n_row, params.n_col, 3, - eig_vectors.data(), eig_vals.data(), - EigVecMemUsage::OVERWRITE_INPUT, stream); + eigSelDC(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + 3, + eig_vectors.data(), + eig_vals.data(), + EigVecMemUsage::OVERWRITE_INPUT, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -87,51 +107,53 @@ class EigSelTest : public ::testing::TestWithParam> { rmm::device_uvector eig_vals_ref; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigSelTest EigSelTestValF; -TEST_P(EigSelTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestValD; -TEST_P(EigSelTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecF; -TEST_P(EigSelTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + 12, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecD; -TEST_P(EigSelTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + 12, + raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index c3b26f5423..5ecca16be6 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -26,19 +26,17 @@ namespace linalg { //// Testing unary ops template -__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = scalar * in[idx]; - } + if (idx < len) { out[idx] = scalar * in[idx]; } } template -void naiveScale(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -52,26 +50,28 @@ struct ScalarMultiplyInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ScalarMultiplyInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs& dims) +{ return os; } template -class ScalarMultiplyTest - : public ::testing::TestWithParam> { +class ScalarMultiplyTest : public ::testing::TestWithParam> { public: ScalarMultiplyTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(len, stream), out_ref(len, stream), - out(len, stream) {} + out(len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; T scalar = params.scalar; r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, scalar, len, stream); @@ -87,46 +87,43 @@ class ScalarMultiplyTest rmm::device_uvector in, out_ref, out; }; -const std::vector> inputsf1 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; const std::vector> inputsd1 = { {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; typedef ScalarMultiplyTest ScalarMultiplyTestF; -TEST_P(ScalarMultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } typedef ScalarMultiplyTest ScalarMultiplyTestD; -TEST_P(ScalarMultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, - ::testing::ValuesIn(inputsf1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1)); -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, - ::testing::ValuesIn(inputsd1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1)); //// Testing binary ops template -__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, - int len) { +__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] + in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] + in2[idx]; } } template -void naiveAdd(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -139,8 +136,8 @@ struct EltwiseAddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const EltwiseAddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs& dims) +{ return os; } @@ -153,10 +150,13 @@ class EltwiseAddTest : public ::testing::TestWithParam> { in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -175,29 +175,27 @@ class EltwiseAddTest : public ::testing::TestWithParam> { rmm::device_uvector in1, in2, out_ref, out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef EltwiseAddTest EltwiseAddTestF; -TEST_P(EltwiseAddTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } typedef EltwiseAddTest EltwiseAddTestD; -TEST_P(EltwiseAddTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index 699d40d55e..6231715c8a 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -36,9 +36,9 @@ struct GemmLayoutInputs { // Reference GEMM implementation. template -__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, - bool isZColMajor, bool isXColMajor, - bool isYColMajor) { +__global__ void naiveGemm( + T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor) +{ int tidx = blockIdx.x * blockDim.x + threadIdx.x; int tidy = blockIdx.y * blockDim.y + threadIdx.y; @@ -51,7 +51,7 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, temp += X[xIndex] * Y[yIndex]; } int zIndex = isZColMajor ? m + n * M : m * N + n; - Z[zIndex] = temp; + Z[zIndex] = temp; } } } @@ -59,7 +59,8 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, template class GemmLayoutTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; @@ -72,8 +73,8 @@ class GemmLayoutTest : public ::testing::TestWithParam> { // Dimensions of Y : K x N // Dimensions of Z : M x N - T *X = NULL; // Argument X - T *Y = NULL; // Argument Y + T* X = NULL; // Argument X + T* Y = NULL; // Argument Y size_t xElems = params.M * params.K; size_t yElems = params.K * params.N; @@ -87,27 +88,35 @@ class GemmLayoutTest : public ::testing::TestWithParam> { r.uniform(X, xElems, T(-10.0), T(10.0), stream); r.uniform(Y, yElems, T(-10.0), T(10.0), stream); - dim3 blocks(raft::ceildiv(params.M, 128), - raft::ceildiv(params.N, 4), 1); + dim3 blocks(raft::ceildiv(params.M, 128), raft::ceildiv(params.N, 4), 1); dim3 threads(128, 4, 1); - naiveGemm<<>>(refZ, X, Y, params.M, params.N, params.K, - params.zLayout, params.xLayout, - params.yLayout); - - gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout, - params.xLayout, params.yLayout, stream); + naiveGemm<<>>( + refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout); + + gemm(handle, + Z, + X, + Y, + params.M, + params.N, + params.K, + params.zLayout, + params.xLayout, + params.yLayout, + stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(refZ)); CUDA_CHECK(cudaFree(Z)); } protected: GemmLayoutInputs params; - T *refZ = NULL; // Reference result for comparison - T *Z = NULL; // Computed result + T* refZ = NULL; // Reference result for comparison + T* Z = NULL; // Computed result }; const std::vector> inputsf = { @@ -131,22 +140,20 @@ const std::vector> inputsd = { {50, 80, 60, false, false, false, 893038ULL}}; typedef GemmLayoutTest GemmLayoutTestF; -TEST_P(GemmLayoutTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-4))); +TEST_P(GemmLayoutTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-4))); } typedef GemmLayoutTest GemmLayoutTestD; -TEST_P(GemmLayoutTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-6))); +TEST_P(GemmLayoutTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-6))); } -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu index 92e59ae49b..4d5472f38c 100644 --- a/cpp/test/linalg/gemv.cu +++ b/cpp/test/linalg/gemv.cu @@ -34,10 +34,16 @@ struct GemvInputs { // Reference GEMV implementation. template -__global__ void naiveGemv(T *y, const T *A, const T *x, const int n_rows, - const int n_cols, const int lda, const bool trans_a) { +__global__ void naiveGemv(T* y, + const T* A, + const T* x, + const int n_rows, + const int n_cols, + const int lda, + const bool trans_a) +{ int istart = blockIdx.x * blockDim.x + threadIdx.x; - int istep = blockDim.x * gridDim.x; + int istep = blockDim.x * gridDim.x; if (!trans_a) { for (int i = istart; i < n_rows; i += istep) { @@ -69,12 +75,14 @@ class GemvTest : public ::testing::TestWithParam> { GemvTest() : testing::TestWithParam>(), refy(0, rmm::cuda_stream_default), - y(0, rmm::cuda_stream_default) { + y(0, rmm::cuda_stream_default) + { rmm::cuda_stream_default.synchronize(); } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; @@ -98,39 +106,55 @@ class GemvTest : public ::testing::TestWithParam> { dim3 blocks(raft::ceildiv(yElems, 256), 1, 1); dim3 threads(256, 1, 1); - naiveGemv<<>>(refy.data(), A.data(), x.data(), - params.n_rows, params.n_cols, params.lda, - params.trans_a); - - gemv(handle, A.data(), params.n_rows, params.n_cols, params.lda, x.data(), - y.data(), params.trans_a, stream); + naiveGemv<<>>( + refy.data(), A.data(), x.data(), params.n_rows, params.n_cols, params.lda, params.trans_a); + + gemv(handle, + A.data(), + params.n_rows, + params.n_cols, + params.lda, + x.data(), + y.data(), + params.trans_a, + stream); } void TearDown() override {} }; -const std::vector> inputsf = { - {80, 70, 80, true, 76433ULL}, {80, 100, 80, true, 426646ULL}, - {20, 100, 20, true, 37703ULL}, {100, 60, 200, true, 538004ULL}, - {50, 10, 60, false, 73012ULL}, {90, 90, 90, false, 538147ULL}, - {30, 100, 30, false, 412352ULL}, {40, 80, 100, false, 297941ULL}}; - -const std::vector> inputsd = { - {10, 70, 10, true, 535648ULL}, {30, 30, 30, true, 956681ULL}, - {70, 80, 70, true, 875083ULL}, {80, 90, 200, true, 50744ULL}, - {90, 90, 90, false, 506321ULL}, {40, 100, 70, false, 638418ULL}, - {80, 50, 80, false, 701529ULL}, {50, 80, 60, false, 893038ULL}}; +const std::vector> inputsf = {{80, 70, 80, true, 76433ULL}, + {80, 100, 80, true, 426646ULL}, + {20, 100, 20, true, 37703ULL}, + {100, 60, 200, true, 538004ULL}, + {50, 10, 60, false, 73012ULL}, + {90, 90, 90, false, 538147ULL}, + {30, 100, 30, false, 412352ULL}, + {40, 80, 100, false, 297941ULL}}; + +const std::vector> inputsd = {{10, 70, 10, true, 535648ULL}, + {30, 30, 30, true, 956681ULL}, + {70, 80, 70, true, 875083ULL}, + {80, 90, 200, true, 50744ULL}, + {90, 90, 90, false, 506321ULL}, + {40, 100, 70, false, 638418ULL}, + {80, 50, 80, false, 701529ULL}, + {50, 80, 60, false, 893038ULL}}; typedef GemvTest GemvTestF; -TEST_P(GemvTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), +TEST_P(GemvTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refy.data(), + y.data(), params.trans_a ? params.n_cols : params.n_rows, raft::CompareApprox(1e-4))); } typedef GemvTest GemvTestD; -TEST_P(GemvTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), +TEST_P(GemvTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refy.data(), + y.data(), params.trans_a ? params.n_cols : params.n_rows, raft::CompareApprox(1e-6))); } diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index f04c225aa9..787d9ba415 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -25,13 +25,22 @@ namespace raft { namespace linalg { template -void mapLaunch(OutType *out, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { +void mapLaunch(OutType* out, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ map( - out, len, + out, + len, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - stream, in1, in2, in3); + stream, + in1, + in2, + in3); } template @@ -43,9 +52,14 @@ struct MapInputs { }; template -void create_ref(OutType *out_ref, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { +void create_ref(OutType* out_ref, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ rmm::device_uvector tmp(len, stream); eltwiseAdd(tmp.data(), in1, in2, len, stream); eltwiseAdd(out_ref, tmp.data(), in3, len, stream); @@ -54,21 +68,22 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2, } template -class MapTest - : public ::testing::TestWithParam> { +class MapTest : public ::testing::TestWithParam> { public: MapTest() - : params(::testing::TestWithParam< - MapInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in1(params.len, stream), in2(params.len, stream), in3(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); IdxType len = params.len; @@ -76,10 +91,8 @@ class MapTest r.uniform(in2.data(), len, InType(-1.0), InType(1.0), stream); r.uniform(in3.data(), len, InType(-1.0), InType(1.0), stream); - create_ref(out_ref.data(), in1.data(), in2.data(), in3.data(), - params.scalar, len, stream); - mapLaunch(out.data(), in1.data(), in2.data(), in3.data(), params.scalar, - len, stream); + create_ref(out_ref.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream); + mapLaunch(out.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -92,55 +105,52 @@ class MapTest rmm::device_uvector out_ref, out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL, 3.2}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}}; typedef MapTest MapTestF_i32; -TEST_P(MapTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL, 9.4}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}}; typedef MapTest MapTestF_i64; -TEST_P(MapTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL, 5.9}}; typedef MapTest MapTestF_i32_D; -TEST_P(MapTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32_D, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL, 7.5}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}}; typedef MapTest MapTestD_i32; -TEST_P(MapTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL, 5.2}}; typedef MapTest MapTestD_i64; -TEST_P(MapTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 9d59e49e60..1594cc3544 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -27,21 +27,18 @@ namespace raft { namespace linalg { template -__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len, - MapOp map) { +__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - raft::myAtomicAdd(out, (OutType)map(in[idx])); - } + if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); } } template -void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map, - cudaStream_t stream) { +void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, (size_t)TPB); - naiveMapReduceKernel - <<>>(out, in, len, map); + int nblks = raft::ceildiv(len, (size_t)TPB); + naiveMapReduceKernel<<>>(out, in, len, map); CUDA_CHECK(cudaPeekAtLastError()); } @@ -53,7 +50,8 @@ struct MapReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs& dims) +{ return os; } @@ -61,8 +59,9 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in, - size_t len, cudaStream_t stream) { +void mapReduceLaunch( + OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream) +{ auto op = [] __device__(InType in) { return in; }; naiveMapReduce(out_ref, in, len, op, stream); mapThenSumReduce(out, len, op, 0, in); @@ -78,10 +77,12 @@ class MapReduceTest : public ::testing::TestWithParam> { out_ref(params.len, stream), out(params.len, stream) - {} + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); auto len = params.len; r.uniform(in.data(), len, InType(-1.0), InType(1.0), stream); @@ -98,42 +99,40 @@ class MapReduceTest : public ::testing::TestWithParam> { rmm::device_uvector out_ref, out; }; -const std::vector> inputsf = { - {0.001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.001f, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestFF; -TEST_P(MapReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFF, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf)); typedef MapReduceTest MapReduceTestFD; -TEST_P(MapReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf)); -const std::vector> inputsd = { - {0.000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.000001, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestDD; -TEST_P(MapReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestDD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd)); template class MapGenericReduceTest : public ::testing::Test { - using InType = typename T::first_type; + using InType = typename T::first_type; using OutType = typename T::second_type; protected: - MapGenericReduceTest() - : input(n, handle.get_stream()), output(handle.get_stream()) { + MapGenericReduceTest() : input(n, handle.get_stream()), output(handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -142,7 +141,8 @@ class MapGenericReduceTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void initInput(InType *input, int n, cudaStream_t stream) { + void initInput(InType* input, int n, cudaStream_t stream) + { raft::random::Rng r(137); r.uniform(input, n, InType(2), InType(3), stream); InType val = 1; @@ -151,21 +151,19 @@ class MapGenericReduceTest : public ::testing::Test { raft::update_device(input + 337, &val, 1, stream); } - void testMin() { - auto op = [] __device__(InType in) { return in; }; + void testMin() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::max(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare())); } - void testMax() { - auto op = [] __device__(InType in) { return in; }; + void testMax() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::min(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare())); } protected: @@ -178,8 +176,7 @@ class MapGenericReduceTest : public ::testing::Test { }; using IoTypePair = - ::testing::Types, std::pair, - std::pair>; + ::testing::Types, std::pair, std::pair>; TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair); TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); } diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index aad1d1e137..3db7c53041 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -32,8 +32,8 @@ struct MatVecOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MatVecOpInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) +{ return os; } @@ -41,24 +41,45 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, bool useTwoVectors, - cudaStream_t stream) { +void matrixVectorOpLaunch(T* out, + const T* in, + const T* vec1, + const T* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + bool useTwoVectors, + cudaStream_t stream) +{ if (useTwoVectors) { matrixVectorOp( - out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b, T c) { return a + b + c; }, stream); + out, + in, + vec1, + vec2, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b, T c) { return a + b + c; }, + stream); } else { matrixVectorOp( - out, in, vec1, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b) { return a + b; }, stream); + out, + in, + vec1, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b) { return a + b; }, + stream); } } template -class MatVecOpTest - : public ::testing::TestWithParam> { +class MatVecOpTest : public ::testing::TestWithParam> { public: MatVecOpTest() : params(::testing::TestWithParam>::GetParam()), @@ -67,27 +88,50 @@ class MatVecOpTest out_ref(params.rows * params.cols, stream), out(params.rows * params.cols, stream), vec1(params.bcastAlongRows ? params.cols : params.rows, stream), - vec2(params.bcastAlongRows ? params.cols : params.rows, stream) {} + vec2(params.bcastAlongRows ? params.cols : params.rows, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); IdxType N = params.rows, D = params.cols; - IdxType len = N * D; + IdxType len = N * D; IdxType vecLen = params.bcastAlongRows ? D : N; r.uniform(in.data(), len, (T)-1.0, (T)1.0, stream); r.uniform(vec1.data(), vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2.data(), vecLen, (T)-1.0, (T)1.0, stream); if (params.useTwoVectors) { - naiveMatVec(out_ref.data(), in.data(), vec1.data(), vec2.data(), D, N, - params.rowMajor, params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref.data(), + in.data(), + vec1.data(), + vec2.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows, + (T)1.0); } else { - naiveMatVec(out_ref.data(), in.data(), vec1.data(), D, N, params.rowMajor, - params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref.data(), + in.data(), + vec1.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows, + (T)1.0); } - matrixVectorOpLaunch(out.data(), in.data(), vec1.data(), vec2.data(), D, N, - params.rowMajor, params.bcastAlongRows, - params.useTwoVectors, stream); + matrixVectorOpLaunch(out.data(), + in.data(), + vec1.data(), + vec2.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows, + params.useTwoVectors, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -118,23 +162,23 @@ const std::vector> inputsf_i32 = { {0.00001f, 1024, 32, false, false, true, 1234ULL}, {0.00001f, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i32; -TEST_P(MatVecOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.00001f, 2500, 250, false, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i64; -TEST_P(MatVecOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, true, true, false, 1234ULL}, @@ -155,23 +199,27 @@ const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, false, false, true, 1234ULL}, {0.0000001, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i32; -TEST_P(MatVecOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, +TEST_P(MatVecOpTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref.data(), + out.data(), + params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.0000001, 2500, 250, false, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i64; -TEST_P(MatVecOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, +TEST_P(MatVecOpTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref.data(), + out.data(), + params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 69c45c9866..5f9c6f1ef3 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -22,9 +22,15 @@ namespace raft { namespace linalg { template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -37,27 +43,37 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -70,20 +86,25 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2, - IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel<<>>(out, mat, vec1, vec2, D, N, rowMajor, - bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel + <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index f78ae64f05..2a632d55b2 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -32,10 +32,13 @@ class MultiplyTest : public ::testing::TestWithParam> { stream(handle.get_stream()), in(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -53,25 +56,23 @@ class MultiplyTest : public ::testing::TestWithParam> { rmm::device_uvector in, out_ref, out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef MultiplyTest MultiplyTestF; -TEST_P(MultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(MultiplyTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf)); typedef MultiplyTest MultiplyTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(MultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(MultiplyTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 659956534e..6dae606f18 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -34,17 +34,19 @@ struct NormInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const NormInputs &I) { - os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " - << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl; +::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) +{ + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", " + << I.do_sqrt << ", " << I.seed << '}' << std::endl; return os; } ///// Row-wise norm test definitions template -__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { - Type acc = (Type)0; +__global__ void naiveRowNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ + Type acc = (Type)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { @@ -59,12 +61,12 @@ __global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveRowNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveRowNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(N, TPB); + naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } @@ -76,21 +78,22 @@ class RowNormTest : public ::testing::TestWithParam> { stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.rows, stream), - dots_act(params.rows, stream) {} + dots_act(params.rows, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols, len = rows * cols; r.uniform(data.data(), len, T(-1.0), T(1.0), stream); - naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type, - params.do_sqrt, stream); + naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - rowNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream, fin_op); + rowNorm( + dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream, fin_op); } else { - rowNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream); + rowNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -105,10 +108,11 @@ class RowNormTest : public ::testing::TestWithParam> { ///// Column-wise norm test definitisons template -__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { +__global__ void naiveColNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ int colID = threadIdx.x + blockIdx.x * blockDim.x; - if (colID > D) return; //avoid out-of-bounds thread + if (colID > D) return; // avoid out-of-bounds thread Type acc = 0; for (int i = 0; i < N; i++) { @@ -120,12 +124,12 @@ __global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveColNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(D, TPB); - naiveColNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(D, TPB); + naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } @@ -137,22 +141,23 @@ class ColNormTest : public ::testing::TestWithParam> { stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.cols, stream), - dots_act(params.cols, stream) {} + dots_act(params.cols, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols, len = rows * cols; r.uniform(data.data(), len, T(-1.0), T(1.0), stream); - naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type, - params.do_sqrt, stream); + naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - colNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream, fin_op); + colNorm( + dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream, fin_op); } else { - colNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream); + colNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -166,24 +171,23 @@ class ColNormTest : public ::testing::TestWithParam> { }; ///// Row- and column-wise tests -const std::vector> inputsf = { - {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, - - {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, + + {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; const std::vector> inputsd = { {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL}, @@ -205,22 +209,22 @@ const std::vector> inputsd = { {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}}; typedef RowNormTest RowNormTestF; -TEST_P(RowNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } typedef RowNormTest RowNormTestD; -TEST_P(RowNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd)); const std::vector> inputscf = { {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, @@ -261,22 +265,22 @@ const std::vector> inputscd = { {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; typedef ColNormTest ColNormTestF; -TEST_P(ColNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } typedef ColNormTest ColNormTestD; -TEST_P(ColNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, - ::testing::ValuesIn(inputscf)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf)); -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, - ::testing::ValuesIn(inputscd)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 9822ca2c60..25ee0a7b77 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -34,8 +34,8 @@ struct ReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ReduceInputs& dims) +{ return os; } @@ -43,44 +43,58 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void reduceLaunch(OutType *dots, const InType *data, int cols, int rows, - bool rowMajor, bool alongRows, bool inplace, - cudaStream_t stream) { - reduce( - dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace, - [] __device__(InType in, int i) { return static_cast(in * in); }); +void reduceLaunch(OutType* dots, + const InType* data, + int cols, + int rows, + bool rowMajor, + bool alongRows, + bool inplace, + cudaStream_t stream) +{ + reduce(dots, + data, + cols, + rows, + (OutType)0, + rowMajor, + alongRows, + stream, + inplace, + [] __device__(InType in, int i) { return static_cast(in * in); }); } template -class ReduceTest - : public ::testing::TestWithParam> { +class ReduceTest : public ::testing::TestWithParam> { public: ReduceTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.alongRows ? params.rows : params.cols, stream), - dots_act(params.alongRows ? params.rows : params.cols, stream) {} + dots_act(params.alongRows ? params.rows : params.cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - outlen = params.alongRows ? rows : cols; + outlen = params.alongRows ? rows : cols; r.uniform(data.data(), len, InType(-1.0), InType(1.0), stream); - naiveReduction(dots_exp.data(), data.data(), cols, rows, params.rowMajor, - params.alongRows, stream); + naiveReduction( + dots_exp.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, stream); // Perform reduction with default inplace = false first - reduceLaunch(dots_act.data(), data.data(), cols, rows, params.rowMajor, - params.alongRows, false, stream); + reduceLaunch( + dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, false, stream); // Add to result with inplace = true next, which shouldn't affect // in the case of coalescedReduction! if (!(params.rowMajor ^ params.alongRows)) { - reduceLaunch(dots_act.data(), data.data(), cols, rows, params.rowMajor, - params.alongRows, true, stream); + reduceLaunch( + dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, true, stream); } CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -150,31 +164,31 @@ const std::vector> inputsfd = { {0.000002f, 1024, 256, false, false, 1234ULL}}; typedef ReduceTest ReduceTestFF; -TEST_P(ReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFF, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestDD; -TEST_P(ReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestDD, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestFD; -TEST_P(ReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFD, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, - ::testing::ValuesIn(inputsff)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, - ::testing::ValuesIn(inputsdd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, - ::testing::ValuesIn(inputsfd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 7f8319636b..82ddfd4661 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -26,55 +26,60 @@ namespace raft { namespace linalg { template -__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data, - int D, int N) { - OutType acc = (OutType)0; +__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N) +{ + OutType acc = (OutType)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { - acc += - static_cast(data[rowStart * D + i] * data[rowStart * D + i]); + acc += static_cast(data[rowStart * D + i] * data[rowStart * D + i]); } dots[rowStart] = 2 * acc; } } template -void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { +void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveCoalescedReductionKernel - <<>>(dots, data, D, N); + int nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel<<>>(dots, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); } template -void unaryAndGemv(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { - //computes a MLCommon unary op on data (squares it), then computes Ax +void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ + // computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns rmm::device_uvector sq(D * N, stream); raft::linalg::unaryOp( - thrust::raw_pointer_cast(sq.data()), data, D * N, - [] __device__(InType v) { return static_cast(v * v); }, stream); + thrust::raw_pointer_cast(sq.data()), + data, + D * N, + [] __device__(InType v) { return static_cast(v * v); }, + stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - rmm::device_uvector ones(N, stream); //column vector [1...1] + rmm::device_uvector ones(N, stream); // column vector [1...1] raft::linalg::unaryOp( - ones.data(), ones.data(), ones.size(), - [=] __device__(OutType input) { return 1; }, stream); + ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, - sq.data(), D, ones.data(), 1, &beta, - dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv( + handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } template -void naiveReduction(OutType *dots, const InType *data, int D, int N, - bool rowMajor, bool alongRows, cudaStream_t stream) { +void naiveReduction(OutType* dots, + const InType* data, + int D, + int N, + bool rowMajor, + bool alongRows, + cudaStream_t stream) +{ if (rowMajor && alongRows) { naiveCoalescedReduction(dots, data, D, N, stream); } else if (rowMajor && !alongRows) { diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 4f761d39f6..ac387c16bb 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -32,15 +32,14 @@ struct stridedReductionInputs { }; template -void stridedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream) { - stridedReduction(dots, data, cols, rows, (T)0, stream, false, - [] __device__(T in, int i) { return in * in; }); +void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream) +{ + stridedReduction( + dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; }); } template -class stridedReductionTest - : public ::testing::TestWithParam> { +class stridedReductionTest : public ::testing::TestWithParam> { public: stridedReductionTest() : params(::testing::TestWithParam>::GetParam()), @@ -48,15 +47,17 @@ class stridedReductionTest data(params.rows * params.cols, stream), dots_exp(params.cols, stream), // expected dot products (from test) dots_act(params.cols, stream) // actual dot products (from prim) - {} + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; r.uniform(data.data(), len, T(-1.0), T(1.0), - stream); //initialize matrix to random + stream); // initialize matrix to random unaryAndGemv(dots_exp.data(), data.data(), cols, rows, stream); stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream); @@ -71,35 +72,33 @@ class stridedReductionTest rmm::device_uvector data, dots_exp, dots_act; }; -const std::vector> inputsf = { - {0.00001f, 1024, 32, 1234ULL}, - {0.00001f, 1024, 64, 1234ULL}, - {0.00001f, 1024, 128, 1234ULL}, - {0.00001f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef stridedReductionTest stridedReductionTestF; -TEST_P(stridedReductionTestF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } typedef stridedReductionTest stridedReductionTestD; -TEST_P(stridedReductionTestD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index 0a82da61c9..77c14a8a7b 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -24,39 +24,34 @@ namespace raft { namespace linalg { template -__global__ void naiveSubtractElemKernel(Type *out, const Type *in1, - const Type *in2, int len) { +__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] - in2[idx]; } } template -void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveSubtractElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1, - const Type in2, int len) { +__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2; - } + if (idx < len) { out[idx] = in1[idx] - in2; } } template -void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len, - cudaStream_t stream) { +void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveSubtractScalarKernel - <<>>(out, in1, in2, len); + int nblks = raft::ceildiv(len, TPB); + naiveSubtractScalarKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -68,7 +63,8 @@ struct SubtractInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SubtractInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SubtractInputs& dims) +{ return os; } @@ -81,10 +77,13 @@ class SubtractTest : public ::testing::TestWithParam> { in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; r.uniform(in1.data(), len, T(-1.0), T(1.0), stream); @@ -108,35 +107,33 @@ class SubtractTest : public ::testing::TestWithParam> { rmm::device_uvector in1, in2, out_ref, out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef SubtractTest SubtractTestF; -TEST_P(SubtractTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); } typedef SubtractTest SubtractTestD; -TEST_P(SubtractTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index 8ebbf19683..61c2c2e3db 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -35,7 +35,8 @@ struct SvdInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SvdInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SvdInputs& dims) +{ return os; } @@ -51,10 +52,13 @@ class SvdTest : public ::testing::TestWithParam> { sing_vals_qr(params.n_col, stream), left_eig_vectors_ref(params.n_row * params.n_col, stream), right_eig_vectors_ref(params.n_col * params.n_col, stream), - sing_vals_ref(params.len, stream) {} + sing_vals_ref(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; @@ -63,26 +67,30 @@ class SvdTest : public ::testing::TestWithParam> { T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; raft::update_device(data.data(), data_h, len, stream); - int left_evl = params.n_row * params.n_col; + int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; - T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, - 0.488195, 0.110706, -0.865685}; + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685}; T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636}; T sing_vals_ref_h[] = {7.065283, 1.040081}; - raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, - left_evl, stream); - raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, - right_evl, stream); - raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, params.n_col, - stream); - - svdQR(handle, data.data(), params.n_row, params.n_col, sing_vals_qr.data(), - left_eig_vectors_qr.data(), right_eig_vectors_trans_qr.data(), true, - true, true, stream); + raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, left_evl, stream); + raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, right_evl, stream); + raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, params.n_col, stream); + + svdQR(handle, + data.data(), + params.n_row, + params.n_col, + sing_vals_qr.data(), + left_eig_vectors_qr.data(), + right_eig_vectors_trans_qr.data(), + true, + true, + true, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -91,71 +99,75 @@ class SvdTest : public ::testing::TestWithParam> { cudaStream_t stream; SvdInputs params; - rmm::device_uvector data, left_eig_vectors_qr, right_eig_vectors_trans_qr, - sing_vals_qr, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref; + rmm::device_uvector data, left_eig_vectors_qr, right_eig_vectors_trans_qr, sing_vals_qr, + left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref; }; -const std::vector> inputsf2 = { - {0.00001f, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00001, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}}; typedef SvdTest SvdTestValF; -TEST_P(SvdTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref.data(), sing_vals_qr.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(sing_vals_ref.data(), + sing_vals_qr.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestValD; -TEST_P(SvdTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref.data(), sing_vals_qr.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(sing_vals_ref.data(), + sing_vals_qr.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecF; -TEST_P(SvdTestLeftVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(left_eig_vectors_ref.data(), left_eig_vectors_qr.data(), - params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref.data(), + left_eig_vectors_qr.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecD; -TEST_P(SvdTestLeftVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(left_eig_vectors_ref.data(), left_eig_vectors_qr.data(), - params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref.data(), + left_eig_vectors_qr.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecF; -TEST_P(SvdTestRightVecF, Result) { - ASSERT_TRUE(raft::devArrMatch( - right_eig_vectors_ref.data(), right_eig_vectors_trans_qr.data(), - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref.data(), + right_eig_vectors_trans_qr.data(), + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecD; -TEST_P(SvdTestRightVecD, Result) { - ASSERT_TRUE(raft::devArrMatch( - right_eig_vectors_ref.data(), right_eig_vectors_trans_qr.data(), - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref.data(), + right_eig_vectors_trans_qr.data(), + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2)); // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, // ::testing::ValuesIn(inputsf2)); diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index 1d8ef08673..fde5599bc1 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -34,7 +34,8 @@ struct TranposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const TranposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const TranposeInputs& dims) +{ return os; } @@ -46,10 +47,13 @@ class TransposeTest : public ::testing::TestWithParam> { stream(handle.get_stream()), data(params.len, stream), data_trans_ref(params.len, stream), - data_trans(params.len, stream) {} + data_trans(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { int len = params.len; ASSERT(params.len == 9, "This test works only with len=9!"); T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; @@ -57,8 +61,7 @@ class TransposeTest : public ::testing::TestWithParam> { T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0}; raft::update_device(data_trans_ref.data(), data_ref_h, len, stream); - transpose(handle, data.data(), data_trans.data(), params.n_row, - params.n_col, stream); + transpose(handle, data.data(), data_trans.data(), params.n_row, params.n_col, stream); transpose(data.data(), params.n_row, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -71,39 +74,41 @@ class TransposeTest : public ::testing::TestWithParam> { rmm::device_uvector data, data_trans, data_trans_ref; }; -const std::vector> inputsf2 = { - {0.1f, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}}; -const std::vector> inputsd2 = { - {0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; typedef TransposeTest TransposeTestValF; -TEST_P(TransposeTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data_trans.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data_trans.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef TransposeTest TransposeTestValD; -TEST_P(TransposeTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data_trans.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data_trans.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 0fcf465150..ff6723973d 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -28,49 +28,49 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ if (in == nullptr) { auto op = [scalar] __device__(OutType * ptr, IdxType idx) { *ptr = static_cast(scalar * idx); }; writeOnlyUnaryOp(out, len, op, stream); } else { - auto op = [scalar] __device__(InType in) { - return static_cast(in * scalar); - }; + auto op = [scalar] __device__(InType in) { return static_cast(in * scalar); }; unaryOp(out, in, len, op, stream); } } template -class UnaryOpTest - : public ::testing::TestWithParam> { +class UnaryOpTest : public ::testing::TestWithParam> { public: UnaryOpTest() - : params(::testing::TestWithParam< - UnaryOpInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); auto len = params.len; r.uniform(in.data(), len, InType(-1.0), InType(1.0), stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - virtual void DoTest() { - auto len = params.len; + virtual void DoTest() + { + auto len = params.len; auto scalar = params.scalar; naiveScale(out_ref.data(), in.data(), scalar, len, stream); unaryOpLaunch(out.data(), in.data(), scalar, len, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } protected: @@ -85,15 +85,15 @@ class UnaryOpTest template class WriteOnlyUnaryOpTest : public UnaryOpTest { protected: - void DoTest() override { - auto len = this->params.len; + void DoTest() override + { + auto len = this->params.len; auto scalar = this->params.scalar; - naiveScale(this->out_ref.data(), (OutType *)nullptr, scalar, len, - this->stream); - unaryOpLaunch(this->out.data(), (OutType *)nullptr, scalar, len, - this->stream); + naiveScale(this->out_ref.data(), (OutType*)nullptr, scalar, len, this->stream); + unaryOpLaunch(this->out.data(), (OutType*)nullptr, scalar, len, this->stream); CUDA_CHECK(cudaStreamSynchronize(this->stream)); - ASSERT_TRUE(devArrMatch(this->out_ref.data(), this->out.data(), + ASSERT_TRUE(devArrMatch(this->out_ref.data(), + this->out.data(), this->params.len, CompareApprox(this->params.tolerance))); } @@ -103,8 +103,7 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest { TEST_P(Name, Result) { DoTest(); } \ INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs)) -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef UnaryOpTest UnaryOpTestF_i32; UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32); typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i32; diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index be3f1124c5..3343389af8 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -24,8 +24,8 @@ namespace raft { namespace linalg { template -__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, - IdxType len) { +__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { if (in == nullptr) { @@ -38,12 +38,11 @@ __global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, } template -void naiveScale(OutType *out, const InType *in, InType scalar, int len, - cudaStream_t stream) { +void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveScaleKernel - <<>>(out, in, scalar, len); + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -56,8 +55,8 @@ struct UnaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const UnaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs& d) +{ return os; } diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 7c7f29815b..7042f5b48d 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -24,53 +24,51 @@ namespace raft { namespace matrix { template -__global__ void nativePowerKernel(Type *in, Type *out, int len) { +__global__ void nativePowerKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] * in[idx]; - } + if (idx < len) { out[idx] = in[idx] * in[idx]; } } template -void naivePower(Type *in, Type *out, int len, cudaStream_t stream) { +void naivePower(Type* in, Type* out, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativePowerKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void nativeSqrtKernel(Type *in, Type *out, int len) { +__global__ void nativeSqrtKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = sqrt(in[idx]); - } + if (idx < len) { out[idx] = sqrt(in[idx]); } } template -void naiveSqrt(Type *in, Type *out, int len) { +void naiveSqrt(Type* in, Type* out, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativeSqrtKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, - int colCount) { +__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount) +{ int d_i = blockIdx.x * rowCount; int end = d_i + rowCount; if (blockIdx.x < colCount) { - Type max = 0.0; + Type max = 0.0; int max_index = 0; for (int i = d_i; i < end; i++) { Type val = in[i]; - if (val < 0.0) { - val = -val; - } + if (val < 0.0) { val = -val; } if (val > max) { - max = val; + max = val; max_index = i; } } @@ -88,7 +86,8 @@ __global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, } template -void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) { +void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount) +{ naiveSignFlipKernel<<>>(in, out, rowCount, colCount); CUDA_CHECK(cudaPeekAtLastError()); } @@ -103,7 +102,8 @@ struct MathInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MathInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MathInputs& dims) +{ return os; } @@ -126,12 +126,15 @@ class MathTest : public ::testing::TestWithParam> { out_recip(4, stream), in_smallzero(4, stream), out_smallzero(4, stream), - out_smallzero_ref(4, stream) {} + out_smallzero_ref(4, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { random::Rng r(params.seed); - int len = params.len; + int len = params.len; T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0}; update_device(in_ratio.data(), in_ratio_h, 4, stream); @@ -151,12 +154,11 @@ class MathTest : public ::testing::TestWithParam> { ratio(handle, in_ratio.data(), in_ratio.data(), 4, stream); - naiveSignFlip(in_sign_flip.data(), out_sign_flip_ref.data(), params.n_row, - params.n_col); + naiveSignFlip(in_sign_flip.data(), out_sign_flip_ref.data(), params.n_row, params.n_col); signFlip(in_sign_flip.data(), params.n_row, params.n_col, stream); // default threshold is 1e-15 - std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; + std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; update_device(in_recip.data(), in_recip_h.data(), 4, stream); update_device(in_recip_ref.data(), in_recip_ref_h.data(), 4, stream); @@ -167,12 +169,11 @@ class MathTest : public ::testing::TestWithParam> { reciprocal(in_recip.data(), recip_scalar, 4, stream, true); - std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; + std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; update_device(in_smallzero.data(), in_small_val_zero_h.data(), 4, stream); - update_device(out_smallzero_ref.data(), in_small_val_zero_ref_h.data(), 4, - stream); + update_device(out_smallzero_ref.data(), in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero.data(), in_smallzero.data(), 4, stream); setSmallValuesZero(in_smallzero.data(), 4, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -183,137 +184,139 @@ class MathTest : public ::testing::TestWithParam> { cudaStream_t stream; MathInputs params; - rmm::device_uvector in_power, out_power_ref, in_sqrt, out_sqrt_ref, - in_ratio, out_ratio_ref, in_sign_flip, out_sign_flip_ref, in_recip, - in_recip_ref, out_recip, in_smallzero, out_smallzero, out_smallzero_ref; + rmm::device_uvector in_power, out_power_ref, in_sqrt, out_sqrt_ref, in_ratio, out_ratio_ref, + in_sign_flip, out_sign_flip_ref, in_recip, in_recip_ref, out_recip, in_smallzero, out_smallzero, + out_smallzero_ref; }; -const std::vector> inputsf = { - {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd = { - {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; typedef MathTest MathPowerTestF; -TEST_P(MathPowerTestF, Result) { - ASSERT_TRUE(devArrMatch(in_power.data(), out_power_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_power.data(), out_power_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathPowerTestD; -TEST_P(MathPowerTestD, Result) { - ASSERT_TRUE(devArrMatch(in_power.data(), out_power_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_power.data(), out_power_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestF; -TEST_P(MathSqrtTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt.data(), out_sqrt_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sqrt.data(), out_sqrt_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestD; -TEST_P(MathSqrtTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt.data(), out_sqrt_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sqrt.data(), out_sqrt_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestF; -TEST_P(MathRatioTestF, Result) { - ASSERT_TRUE(devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestD; -TEST_P(MathRatioTestD, Result) { - ASSERT_TRUE(devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestF; -TEST_P(MathSignFlipTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip.data(), out_sign_flip_ref.data(), - params.len, CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestF, Result) +{ + ASSERT_TRUE(devArrMatch(in_sign_flip.data(), + out_sign_flip_ref.data(), + params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestD; -TEST_P(MathSignFlipTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip.data(), out_sign_flip_ref.data(), - params.len, CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestD, Result) +{ + ASSERT_TRUE(devArrMatch(in_sign_flip.data(), + out_sign_flip_ref.data(), + params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestF; -TEST_P(MathReciprocalTestF, Result) { - ASSERT_TRUE(devArrMatch(in_recip.data(), in_recip_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_recip.data(), in_recip_ref.data(), 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip.data(), in_recip_ref.data(), 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_recip.data(), in_recip_ref.data(), 3, CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestD; -TEST_P(MathReciprocalTestD, Result) { - ASSERT_TRUE(devArrMatch(in_recip.data(), in_recip_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_recip.data(), in_recip_ref.data(), 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip.data(), in_recip_ref.data(), 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_recip.data(), in_recip_ref.data(), 3, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestF; -TEST_P(MathSetSmallZeroTestF, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestD; -TEST_P(MathSetSmallZeroTestD, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd)); } // namespace matrix } // namespace raft diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index e247abad1e..6f052f7b46 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -33,7 +33,8 @@ struct MatrixInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MatrixInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatrixInputs& dims) +{ return os; } @@ -45,10 +46,13 @@ class MatrixTest : public ::testing::TestWithParam> { stream(handle.get_stream()), in1(params.n_row * params.n_col, stream), in2(params.n_row * params.n_col, stream), - in1_revr(params.n_row * params.n_col, stream) {} + in1_revr(params.n_row * params.n_col, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; r.uniform(in1.data(), len, T(-1.0), T(1.0), stream); @@ -72,87 +76,84 @@ class MatrixTest : public ::testing::TestWithParam> { const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 4, 4, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 4, 4, 1234ULL}}; typedef MatrixTest MatrixTestF; -TEST_P(MatrixTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(in1.data(), in2.data(), +TEST_P(MatrixTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(in1.data(), + in2.data(), params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } typedef MatrixTest MatrixTestD; -TEST_P(MatrixTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(in1.data(), in2.data(), +TEST_P(MatrixTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(in1.data(), + in2.data(), params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2)); template class MatrixCopyRowsTest : public ::testing::Test { - using math_t = typename std::tuple_element<0, T>::type; - using idx_t = typename std::tuple_element<1, T>::type; + using math_t = typename std::tuple_element<0, T>::type; + using idx_t = typename std::tuple_element<1, T>::type; using idx_array_t = typename std::tuple_element<2, T>::type; protected: MatrixCopyRowsTest() : input(n_cols * n_rows, handle.get_stream()), indices(n_selected, handle.get_stream()), - output(n_cols * n_selected, handle.get_stream()) { + output(n_cols * n_selected, handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, - ptr); + thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testCopyRows() { - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, false); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(), - n_selected * n_cols, - raft::Compare())); - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, true); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(), - n_selected * n_cols, - raft::Compare())); + void testCopyRows() + { + copyRows( + input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare())); + copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare())); } protected: raft::handle_t handle; cudaStream_t stream; - int n_rows = 10; - int n_cols = 3; + int n_rows = 10; + int n_cols = 3; int n_selected = 5; - idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; - math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, - 17, 19, 20, 23, 24, 27, 29}; - math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, - 14, 21, 22, 23, 27, 28, 29}; + idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; + math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29}; + math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29}; rmm::device_uvector input; rmm::device_uvector output; rmm::device_uvector indices; }; -using TypeTuple = - ::testing::Types, std::tuple, - std::tuple, - std::tuple>; +using TypeTuple = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple); TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); } diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index fe42cea8b3..5cfcc910fd 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -25,7 +25,8 @@ namespace raft { namespace mr { namespace device { -TEST(Raft, DeviceBufferAlloc) { +TEST(Raft, DeviceBufferAlloc) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction @@ -51,13 +52,14 @@ TEST(Raft, DeviceBufferAlloc) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceBufferZeroResize) { +TEST(Raft, DeviceBufferZeroResize) +{ // Create a limiting_resource_adaptor to track allocations - auto curr_mr = dynamic_cast( - rmm::mr::get_current_device_resource()); - auto limit_mr = std::make_shared< - rmm::mr::limiting_resource_adaptor>(curr_mr, - 1000); + auto curr_mr = + dynamic_cast(rmm::mr::get_current_device_resource()); + auto limit_mr = + std::make_shared>(curr_mr, + 1000); rmm::mr::set_current_device_resource(limit_mr.get()); diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp index 953f65ddfb..aadf05285c 100644 --- a/cpp/test/mr/host/buffer.cpp +++ b/cpp/test/mr/host/buffer.cpp @@ -24,7 +24,8 @@ namespace raft { namespace mr { namespace host { -TEST(Raft, HostBuffer) { +TEST(Raft, HostBuffer) +{ auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -51,14 +52,14 @@ TEST(Raft, HostBuffer) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceToHostBuffer) { +TEST(Raft, DeviceToHostBuffer) +{ auto d_alloc = std::make_shared(); auto h_alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); device::buffer d_buff(d_alloc, stream, 32); - CUDA_CHECK( - cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); buffer h_buff(h_alloc, d_buff); ASSERT_EQ(d_buff.size(), h_buff.size()); CUDA_CHECK(cudaStreamSynchronize(stream)); diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 781e6d1d3f..90a6d7bd87 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -61,7 +61,8 @@ namespace mst { // Sequential prims function // Returns total weight of MST template -weight_t prims(CSRHost &csr_h) { +weight_t prims(CSRHost& csr_h) +{ std::size_t n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; @@ -70,19 +71,18 @@ weight_t prims(CSRHost &csr_h) { for (std::size_t i = 0; i < n_vertices; i++) { active_vertex[i] = false; - curr_edge[i] = static_cast(std::numeric_limits::max()); + curr_edge[i] = static_cast(std::numeric_limits::max()); } curr_edge[0] = 0; // function to pick next min vertex-edge - auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex, - auto n_vertices) { + auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) { auto min = static_cast(std::numeric_limits::max()); vertex_t min_vertex{}; for (std::size_t v = 0; v < n_vertices; v++) { if (!active_vertex[v] && curr_edge[v] < min) { - min = curr_edge[v]; + min = curr_edge[v]; min_vertex = v; } } @@ -98,14 +98,13 @@ weight_t prims(CSRHost &csr_h) { active_vertex[curr_v] = true; // set to active // iterate through edges of current active vertex - auto edge_st = csr_h.offsets[curr_v]; + auto edge_st = csr_h.offsets[curr_v]; auto edge_end = csr_h.offsets[curr_v + 1]; for (auto e = edge_st; e < edge_end; e++) { // put edges to be considered for next iteration auto neighbor_idx = csr_h.indices[e]; - if (!active_vertex[neighbor_idx] && - csr_h.weights[e] < curr_edge[neighbor_idx]) { + if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) { curr_edge[neighbor_idx] = csr_h.weights[e]; } } @@ -121,15 +120,15 @@ weight_t prims(CSRHost &csr_h) { } template -class MSTTest - : public ::testing::TestWithParam> { +class MSTTest : public ::testing::TestWithParam> { protected: std::pair, raft::Graph_COO> - mst_gpu() { - edge_t *offsets = static_cast(csr_d.offsets.data()); - vertex_t *indices = static_cast(csr_d.indices.data()); - weight_t *weights = static_cast(csr_d.weights.data()); + mst_gpu() + { + edge_t* offsets = static_cast(csr_d.offsets.data()); + vertex_t* indices = static_cast(csr_d.indices.data()); + weight_t* weights = static_cast(csr_d.weights.data()); v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); @@ -138,89 +137,95 @@ class MSTTest rmm::device_uvector mst_dst(2 * v - 2, handle.get_stream()); rmm::device_uvector color(v, handle.get_stream()); - CUDA_CHECK( - cudaMemsetAsync(mst_src.data(), std::numeric_limits::max(), - mst_src.size() * sizeof(vertex_t), handle.get_stream())); - CUDA_CHECK( - cudaMemsetAsync(mst_dst.data(), std::numeric_limits::max(), - mst_dst.size() * sizeof(vertex_t), handle.get_stream())); - CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), + CUDA_CHECK(cudaMemsetAsync(mst_src.data(), + std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), + handle.get_stream())); + CUDA_CHECK(cudaMemsetAsync(mst_dst.data(), + std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK( + cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), handle.get_stream())); - vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); + vertex_t* color_ptr = thrust::raw_pointer_cast(color.data()); if (iterations == 0) { MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0); auto symmetric_result = symmetric_solver.solve(); MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } else { - MST_solver intermediate_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, iterations); + MST_solver intermediate_solver(handle, + offsets, + indices, + weights, + v, + e, + color_ptr, + handle.get_stream(), + true, + true, + iterations); auto intermediate_result = intermediate_solver.solve(); MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, false, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0); auto symmetric_result = symmetric_solver.solve(); // symmetric_result.n_edges += intermediate_result.n_edges; - auto total_edge_size = - symmetric_result.n_edges + intermediate_result.n_edges; + auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges; symmetric_result.src.resize(total_edge_size, handle.get_stream()); symmetric_result.dst.resize(total_edge_size, handle.get_stream()); symmetric_result.weights.resize(total_edge_size, handle.get_stream()); raft::copy(symmetric_result.src.data() + symmetric_result.n_edges, - intermediate_result.src.data(), intermediate_result.n_edges, + intermediate_result.src.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges, - intermediate_result.dst.data(), intermediate_result.n_edges, + intermediate_result.dst.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges, intermediate_result.weights.data(), - intermediate_result.n_edges, handle.get_stream()); + intermediate_result.n_edges, + handle.get_stream()); symmetric_result.n_edges = total_edge_size; MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } } - void SetUp() override { - mst_input = ::testing::TestWithParam< - MSTTestInput>::GetParam(); + void SetUp() override + { + mst_input = ::testing::TestWithParam>::GetParam(); iterations = mst_input.iterations; - csr_d.offsets = rmm::device_buffer( - mst_input.csr_h.offsets.data(), - mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream()); - csr_d.indices = rmm::device_buffer( - mst_input.csr_h.indices.data(), - mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream()); - csr_d.weights = rmm::device_buffer( - mst_input.csr_h.weights.data(), - mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream()); + csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(), + mst_input.csr_h.offsets.size() * sizeof(edge_t), + handle.get_stream()); + csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(), + mst_input.csr_h.indices.size() * sizeof(vertex_t), + handle.get_stream()); + csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(), + mst_input.csr_h.weights.size() * sizeof(weight_t), + handle.get_stream()); } void TearDown() override {} @@ -272,41 +277,68 @@ const std::vector> csr_in_h = { const std::vector> csr_in4_h = { {{0, 3, 5, 8, 10, 12, 14, 16}, {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; // singletons const std::vector> csr_in5_h = { {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16}, {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; typedef MSTTest MSTTestSequential; -TEST_P(MSTTestSequential, Sequential) { - auto results_pair = mst_gpu(); - auto &symmetric_result = results_pair.first; - auto &non_symmetric_result = results_pair.second; +TEST_P(MSTTestSequential, Sequential) +{ + auto results_pair = mst_gpu(); + auto& symmetric_result = results_pair.first; + auto& non_symmetric_result = results_pair.second; // do assertions here // in this case, running sequential MST auto prims_result = prims(mst_input.csr_h); - auto symmetric_sum = - thrust::reduce(thrust::device, symmetric_result.weights.data(), - symmetric_result.weights.data() + symmetric_result.n_edges); - auto non_symmetric_sum = thrust::reduce( - thrust::device, non_symmetric_result.weights.data(), - non_symmetric_result.weights.data() + non_symmetric_result.n_edges); - - ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, - raft::CompareApprox(0.1))); - ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, - raft::CompareApprox(0.1))); + auto symmetric_sum = thrust::reduce(thrust::device, + symmetric_result.weights.data(), + symmetric_result.weights.data() + symmetric_result.n_edges); + auto non_symmetric_sum = + thrust::reduce(thrust::device, + non_symmetric_result.weights.data(), + non_symmetric_result.weights.data() + non_symmetric_result.n_edges); + + ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox(0.1))); + ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox(0.1))); } -INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, - ::testing::ValuesIn(csr_in_h)); +INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h)); } // namespace mst } // namespace raft diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu index 92976e5c61..c76064ade7 100644 --- a/cpp/test/pow2_utils.cu +++ b/cpp/test/pow2_utils.cu @@ -24,7 +24,8 @@ struct Pow2Test : public ::testing::Test { typedef Pow2 P; std::vector data; - void SetUp() override { + void SetUp() override + { std::vector pos = {0, 1, 2, 7, 15, 16, 17, 31, 35, 1024, 1623}; data.insert(data.end(), pos.begin(), pos.end()); if constexpr (std::is_signed::value) { @@ -35,7 +36,8 @@ struct Pow2Test : public ::testing::Test { data.push_back(std::numeric_limits::max()); } - void quotRem() { + void quotRem() + { for (auto x : data) { ASSERT_EQ(P::quot(x), x / P::Value) << " where x = " << x; ASSERT_EQ(P::rem(x), x % P::Value) << " where x = " << x; @@ -43,31 +45,32 @@ struct Pow2Test : public ::testing::Test { } } - void divMod() { + void divMod() + { for (auto x : data) { ASSERT_GE(P::mod(x), 0) << " where x = " << x; ASSERT_EQ(x, P::div(x) * P::Value + P::mod(x)); } } - void round() { + void round() + { for (auto x : data) { - if (x <= std::numeric_limits::max() - TargetT(P::Value)) - ASSERT_GE(P::roundUp(x), x); + if (x <= std::numeric_limits::max() - TargetT(P::Value)) ASSERT_GE(P::roundUp(x), x); if (x >= std::numeric_limits::min() + TargetT(P::Value)) ASSERT_LE(P::roundDown(x), x); ASSERT_EQ(x - P::roundDown(x), P::mod(x)) << " where x = " << x; - ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0) - << " where x = " << x; + ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0) << " where x = " << x; } } - void alignment() { + void alignment() + { for (auto x : data) { ASSERT_TRUE(P::areSameAlignOffsets(x, x)); if (x <= std::numeric_limits::max() - TargetT(P::Value)) { ASSERT_TRUE(P::areSameAlignOffsets(x, x + TargetT(P::Value))); - int aligned_count = 0; + int aligned_count = 0; int same_aligned_count = 0; for (int i = 0; i < int(P::Value); i++) { aligned_count += P::isAligned(x + i); @@ -97,10 +100,11 @@ TEST_IT(Pow2_u64_i32_128); TEST_IT(Pow2_ll_u16_32); TEST_IT(Pow2_i32_u64_16); -TEST(Pow2, pointers) { +TEST(Pow2, pointers) +{ typedef Pow2<32UL> P; for (ptrdiff_t i = 0; i <= ptrdiff_t(P::Value); i++) { - auto *p = reinterpret_cast(16345 + i); + auto* p = reinterpret_cast(16345 + i); ASSERT_GE(P::roundUp(p), p); ASSERT_LE(P::roundDown(p), p); } diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index 810d6cb871..69dc146486 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -40,12 +40,13 @@ enum RandomType { }; template -__global__ void meanKernel(T* out, const T* data, int len) { +__global__ void meanKernel(T* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int tid = threadIdx.x + blockIdx.x * blockDim.x; - T val = tid < len ? data[tid] : T(0); - T x = BlockReduce(temp_storage).Sum(val); + T val = tid < len ? data[tid] : T(0); + T x = BlockReduce(temp_storage).Sum(val); __syncthreads(); T xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -72,7 +73,8 @@ struct RngInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } @@ -86,47 +88,36 @@ class RngTest : public ::testing::TestWithParam> { : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(0, stream), - stats(2, stream) { + stats(2, stream) + { data.resize(params.len, stream); CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream)); } protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; Rng r(params.seed, params.gtype); switch (params.type) { - case RNG_Normal: - r.normal(data.data(), params.len, params.start, params.end, stream); - break; + case RNG_Normal: r.normal(data.data(), params.len, params.start, params.end, stream); break; case RNG_LogNormal: r.lognormal(data.data(), params.len, params.start, params.end, stream); break; - case RNG_Uniform: - r.uniform(data.data(), params.len, params.start, params.end, stream); - break; - case RNG_Gumbel: - r.gumbel(data.data(), params.len, params.start, params.end, stream); - break; + case RNG_Uniform: r.uniform(data.data(), params.len, params.start, params.end, stream); break; + case RNG_Gumbel: r.gumbel(data.data(), params.len, params.start, params.end, stream); break; case RNG_Logistic: r.logistic(data.data(), params.len, params.start, params.end, stream); break; - case RNG_Exp: - r.exponential(data.data(), params.len, params.start, stream); - break; - case RNG_Rayleigh: - r.rayleigh(data.data(), params.len, params.start, stream); - break; - case RNG_Laplace: - r.laplace(data.data(), params.len, params.start, params.end, stream); - break; + case RNG_Exp: r.exponential(data.data(), params.len, params.start, stream); break; + case RNG_Rayleigh: r.rayleigh(data.data(), params.len, params.start, stream); break; + case RNG_Laplace: r.laplace(data.data(), params.len, params.start, params.end, stream); break; }; static const int threads = 128; - meanKernel - <<>>( - stats.data(), data.data(), params.len); + meanKernel<<>>( + stats.data(), data.data(), params.len); update_host(h_stats, stats.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -134,18 +125,18 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { switch (params.type) { case RNG_Normal: meanvar[0] = params.start; meanvar[1] = params.end * params.end; break; case RNG_LogNormal: { - auto var = params.end * params.end; - auto mu = params.start; + auto var = params.end * params.end; + auto mu = params.start; meanvar[0] = raft::myExp(mu + var * T(0.5)); - meanvar[1] = - (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); break; } case RNG_Uniform: @@ -169,8 +160,7 @@ class RngTest : public ::testing::TestWithParam> { break; case RNG_Rayleigh: meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); - meanvar[1] = - ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; + meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; break; case RNG_Laplace: meanvar[0] = params.start; @@ -264,13 +254,12 @@ const std::vector> inputsf = { {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestF, Result) { +TEST_P(RngTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf)); @@ -326,13 +315,12 @@ const std::vector> inputsd = { {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL}, {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestD, Result) { +TEST_P(RngTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); @@ -340,7 +328,8 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); // Test for expected variance in mean calculations template -T quick_mean(const std::vector& d) { +T quick_mean(const std::vector& d) +{ T acc = T(0); for (const auto& di : d) { acc += di; @@ -349,8 +338,9 @@ T quick_mean(const std::vector& d) { } template -T quick_std(const std::vector& d) { - T acc = T(0); +T quick_std(const std::vector& d) +{ + T acc = T(0); T d_mean = quick_mean(d); for (const auto& di : d) { acc += ((di - d_mean) * (di - d_mean)); @@ -359,7 +349,8 @@ T quick_std(const std::vector& d) { } template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { +std::ostream& operator<<(std::ostream& out, const std::vector& v) +{ if (!v.empty()) { out << '['; std::copy(v.begin(), v.end(), std::ostream_iterator(out, ", ")); @@ -374,13 +365,14 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { // experiments computing the mean, giving us a distribution of the mean // itself. The mean error is simply the standard deviation of this // distribution (the standard deviation of the mean). -TEST(Rng, MeanError) { +TEST(Rng, MeanError) +{ timeb time_struct; ftime(&time_struct); - int seed = time_struct.millitm; - int num_samples = 1024; + int seed = time_struct.millitm; + int num_samples = 1024; int num_experiments = 1024; - int len = num_samples * num_experiments; + int len = num_samples * num_experiments; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -393,22 +385,26 @@ TEST(Rng, MeanError) { Rng r(seed, rtype); r.normal(data.data(), len, 3.3f, 0.23f, stream); // r.uniform(data, len, -1.0, 2.0); - raft::stats::mean(mean_result.data(), data.data(), num_samples, - num_experiments, false, false, stream); - raft::stats::stddev(std_result.data(), data.data(), mean_result.data(), - num_samples, num_experiments, false, false, stream); + raft::stats::mean( + mean_result.data(), data.data(), num_samples, num_experiments, false, false, stream); + raft::stats::stddev(std_result.data(), + data.data(), + mean_result.data(), + num_samples, + num_experiments, + false, + false, + stream); std::vector h_mean_result(num_experiments); std::vector h_std_result(num_experiments); - update_host(h_mean_result.data(), mean_result.data(), num_experiments, - stream); - update_host(h_std_result.data(), std_result.data(), num_experiments, - stream); + update_host(h_mean_result.data(), mean_result.data(), num_experiments, stream); + update_host(h_std_result.data(), std_result.data(), num_experiments, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); auto d_mean = quick_mean(h_mean_result); // std-dev of mean; also known as mean error - auto d_std_of_mean = quick_std(h_mean_result); - auto d_std = quick_mean(h_std_result); + auto d_std_of_mean = quick_std(h_mean_result); + auto d_std = quick_mean(h_std_result); auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples); // std::cout << "measured mean error: " << d_std_of_mean << "\n"; @@ -417,8 +413,7 @@ TEST(Rng, MeanError) { auto diff_expected_vs_measured_mean_error = std::abs(d_std_of_mean - d_std / std::sqrt(num_samples)); - ASSERT_TRUE( - (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); + ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } CUDA_CHECK(cudaStreamDestroy(stream)); @@ -431,18 +426,19 @@ class ScaledBernoulliTest : public ::testing::Test { ScaledBernoulliTest() : stream(handle.get_stream()), data(len, stream) {} protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); r.scaled_bernoulli(data.data(), len, T(0.5), T(scale), stream); } - void rangeCheck() { + void rangeCheck() + { T* h_data = new T[len]; update_host(h_data, data.data(), len, stream); - ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { - return a < -scale || a > scale; - })); + ASSERT_TRUE( + std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; })); delete[] h_data; } @@ -464,13 +460,15 @@ class BernoulliTest : public ::testing::Test { BernoulliTest() : stream(handle.get_stream()), data(len, stream) {} protected: - void SetUp() override { + void SetUp() override + { Rng r(42); r.bernoulli(data.data(), len, T(0.5), stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void trueFalseCheck() { + void trueFalseCheck() + { // both true and false values must be present bool* h_data = new bool[len]; update_host(h_data, data.data(), len, stream); @@ -502,38 +500,39 @@ struct RngNormalTableInputs { }; template -::std::ostream& operator<<(::std::ostream& os, - const RngNormalTableInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs& dims) +{ return os; } template -class RngNormalTableTest - : public ::testing::TestWithParam> { +class RngNormalTableTest : public ::testing::TestWithParam> { public: RngNormalTableTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), stats(2, stream), - mu_vec(params.cols, stream) { + mu_vec(params.cols, stream) + { CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream)); } protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - int len = params.rows * params.cols; + int len = params.rows * params.cols; Rng r(params.seed, params.gtype); r.fill(mu_vec.data(), params.cols, params.mu, stream); T* sigma_vec = nullptr; - r.normalTable(data.data(), params.rows, params.cols, mu_vec.data(), - sigma_vec, params.sigma, stream); + r.normalTable( + data.data(), params.rows, params.cols, mu_vec.data(), sigma_vec, params.sigma, stream); static const int threads = 128; - meanKernel<<>>( - stats.data(), data.data(), len); + meanKernel + <<>>(stats.data(), data.data(), len); update_host(h_stats, stats.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; @@ -541,7 +540,8 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamSynchronize(stream)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { meanvar[0] = params.mu; meanvar[1] = params.sigma * params.sigma; } @@ -565,16 +565,14 @@ const std::vector> inputsf_t = { {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestF, Result) { +TEST_P(RngNormalTableTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, - ::testing::ValuesIn(inputsf_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t)); typedef RngNormalTableTest RngNormalTableTestD; const std::vector> inputsd_t = { @@ -584,16 +582,14 @@ const std::vector> inputsd_t = { {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL}, {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestD, Result) { +TEST_P(RngNormalTableTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, - ::testing::ValuesIn(inputsd_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t)); struct RngAffineInputs { int n; @@ -602,13 +598,15 @@ struct RngAffineInputs { class RngAffineTest : public ::testing::TestWithParam { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam::GetParam(); Rng r(params.seed); r.affine_transform_params(params.n, a, b); } - void check() { + void check() + { ASSERT_TRUE(gcd(a, params.n) == 1); ASSERT_TRUE(0 <= b && b < params.n); } @@ -619,13 +617,17 @@ class RngAffineTest : public ::testing::TestWithParam { }; // RngAffineTest const std::vector inputs_affine = { - {100, 123456ULL}, {100, 1234567890ULL}, {101, 123456ULL}, - {101, 1234567890ULL}, {7, 123456ULL}, {7, 1234567890ULL}, - {2568, 123456ULL}, {2568, 1234567890ULL}, + {100, 123456ULL}, + {100, 1234567890ULL}, + {101, 123456ULL}, + {101, 1234567890ULL}, + {7, 123456ULL}, + {7, 1234567890ULL}, + {2568, 123456ULL}, + {2568, 1234567890ULL}, }; TEST_P(RngAffineTest, Result) { check(); } -INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, - ::testing::ValuesIn(inputs_affine)); +INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine)); } // namespace random } // namespace raft diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index cef2d47276..f0331b7746 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -29,12 +29,13 @@ using namespace raft::random::detail; enum RandomType { RNG_Uniform }; template -__global__ void meanKernel(float *out, const T *data, int len) { +__global__ void meanKernel(float* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; float val = tid < len ? data[tid] : T(0); - float x = BlockReduce(temp_storage).Sum(val); + float x = BlockReduce(temp_storage).Sum(val); __syncthreads(); float xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -61,7 +62,8 @@ struct RngInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const RngInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } @@ -72,13 +74,15 @@ class RngTest : public ::testing::TestWithParam> { : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(0, stream), - stats(2, stream) { + stats(2, stream) + { data.resize(params.len, stream); CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(float), stream)); } protected: - void SetUp() override { + void SetUp() override + { Rng r(params.seed, params.gtype); switch (params.type) { @@ -87,9 +91,8 @@ class RngTest : public ::testing::TestWithParam> { break; }; static const int threads = 128; - meanKernel - <<>>( - stats.data(), data.data(), params.len); + meanKernel<<>>( + stats.data(), data.data(), params.len); update_host(h_stats, stats.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -97,7 +100,8 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void getExpectedMeanVar(float meanvar[2]) { + void getExpectedMeanVar(float meanvar[2]) + { switch (params.type) { case RNG_Uniform: meanvar[0] = (params.start + params.end) * 0.5f; @@ -125,13 +129,12 @@ const std::vector> inputs_u32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU32, Result) { +TEST_P(RngTestU32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32)); @@ -143,13 +146,12 @@ const std::vector> inputs_u64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU64, Result) { +TEST_P(RngTestU64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64)); @@ -161,13 +163,12 @@ const std::vector> inputs_s32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS32, Result) { +TEST_P(RngTestS32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32)); @@ -179,13 +180,12 @@ const std::vector> inputs_s64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS64, Result) { +TEST_P(RngTestS64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64)); diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index 1d33f08c62..a681bbb07d 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -40,7 +40,8 @@ struct SWoRInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) +{ return os; } @@ -53,20 +54,27 @@ class SWoRTest : public ::testing::TestWithParam> { in(params.len, stream), wts(params.len, stream), out(params.sampledLen, stream), - outIdx(params.sampledLen, stream) {} + outIdx(params.sampledLen, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { Rng r(params.seed, params.gtype); h_outIdx.resize(params.sampledLen); r.uniform(in.data(), params.len, T(-1.0), T(1.0), stream); r.uniform(wts.data(), params.len, T(1.0), T(2.0), stream); if (params.largeWeightIndex >= 0) { - update_device(wts.data() + params.largeWeightIndex, ¶ms.largeWeight, - 1, stream); + update_device(wts.data() + params.largeWeightIndex, ¶ms.largeWeight, 1, stream); } - r.sampleWithoutReplacement(handle, out.data(), outIdx.data(), in.data(), - wts.data(), params.sampledLen, params.len, + r.sampleWithoutReplacement(handle, + out.data(), + outIdx.data(), + in.data(), + wts.data(), + params.sampledLen, + params.len, stream); update_host(&(h_outIdx[0]), outIdx.data(), params.sampledLen, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -145,14 +153,14 @@ const std::vector> inputsf = { {1024, 512, 10, 100000.f, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestF, Result) { +TEST_P(SWoRTestF, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -160,9 +168,7 @@ TEST_P(SWoRTestF, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf)); @@ -229,14 +235,14 @@ const std::vector> inputsd = { {1024, 512, 10, 100000.0, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestD, Result) { +TEST_P(SWoRTestD, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -244,9 +250,7 @@ TEST_P(SWoRTestD, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index a5f08489f1..d7e11e8fef 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -44,12 +44,10 @@ struct CSRAddInputs { }; template -class CSRAddTest - : public ::testing::TestWithParam> { +class CSRAddTest : public ::testing::TestWithParam> { public: CSRAddTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), ind_a(params.matrix_a.row_ind.size(), stream), ind_ptr_a(params.matrix_a.row_ind_ptr.size(), stream), @@ -62,59 +60,69 @@ class CSRAddTest values_verify(params.matrix_verify.row_ind_ptr.size(), stream), ind_result(params.matrix_a.row_ind.size(), stream), ind_ptr_result(params.matrix_verify.row_ind_ptr.size(), stream), - values_result(params.matrix_verify.row_ind_ptr.size(), stream) {} + values_result(params.matrix_verify.row_ind_ptr.size(), stream) + { + } protected: - void SetUp() override { - n_rows = params.matrix_a.row_ind.size(); - nnz_a = params.matrix_a.row_ind_ptr.size(); - nnz_b = params.matrix_b.row_ind_ptr.size(); + void SetUp() override + { + n_rows = params.matrix_a.row_ind.size(); + nnz_a = params.matrix_a.row_ind_ptr.size(); + nnz_b = params.matrix_b.row_ind_ptr.size(); nnz_result = params.matrix_verify.row_ind_ptr.size(); } - void Run() { - raft::update_device(ind_a.data(), params.matrix_a.row_ind.data(), n_rows, - stream); - raft::update_device(ind_ptr_a.data(), params.matrix_a.row_ind_ptr.data(), - nnz_a, stream); - raft::update_device(values_a.data(), params.matrix_a.values.data(), nnz_a, - stream); - - raft::update_device(ind_b.data(), params.matrix_b.row_ind.data(), n_rows, - stream); - raft::update_device(ind_ptr_b.data(), params.matrix_b.row_ind_ptr.data(), - nnz_b, stream); - raft::update_device(values_b.data(), params.matrix_b.values.data(), nnz_b, - stream); - - raft::update_device(ind_verify.data(), params.matrix_verify.row_ind.data(), - n_rows, stream); - raft::update_device(ind_ptr_verify.data(), - params.matrix_verify.row_ind_ptr.data(), nnz_result, - stream); - raft::update_device(values_verify.data(), - params.matrix_verify.values.data(), nnz_result, stream); - - Index_ nnz = linalg::csr_add_calc_inds( - ind_a.data(), ind_ptr_a.data(), values_a.data(), nnz_a, ind_b.data(), - ind_ptr_b.data(), values_b.data(), nnz_b, n_rows, ind_result.data(), - stream); + void Run() + { + raft::update_device(ind_a.data(), params.matrix_a.row_ind.data(), n_rows, stream); + raft::update_device(ind_ptr_a.data(), params.matrix_a.row_ind_ptr.data(), nnz_a, stream); + raft::update_device(values_a.data(), params.matrix_a.values.data(), nnz_a, stream); + + raft::update_device(ind_b.data(), params.matrix_b.row_ind.data(), n_rows, stream); + raft::update_device(ind_ptr_b.data(), params.matrix_b.row_ind_ptr.data(), nnz_b, stream); + raft::update_device(values_b.data(), params.matrix_b.values.data(), nnz_b, stream); + + raft::update_device(ind_verify.data(), params.matrix_verify.row_ind.data(), n_rows, stream); + raft::update_device( + ind_ptr_verify.data(), params.matrix_verify.row_ind_ptr.data(), nnz_result, stream); + raft::update_device( + values_verify.data(), params.matrix_verify.values.data(), nnz_result, stream); + + Index_ nnz = linalg::csr_add_calc_inds(ind_a.data(), + ind_ptr_a.data(), + values_a.data(), + nnz_a, + ind_b.data(), + ind_ptr_b.data(), + values_b.data(), + nnz_b, + n_rows, + ind_result.data(), + stream); ASSERT_TRUE(nnz == nnz_result); - ASSERT_TRUE(raft::devArrMatch(ind_verify.data(), ind_result.data(), - n_rows, raft::Compare())); - - linalg::csr_add_finalize( - ind_a.data(), ind_ptr_a.data(), values_a.data(), nnz_a, ind_b.data(), - ind_ptr_b.data(), values_b.data(), nnz_b, n_rows, ind_result.data(), - ind_ptr_result.data(), values_result.data(), stream); - - ASSERT_TRUE(raft::devArrMatch(ind_ptr_verify.data(), - ind_ptr_result.data(), nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(values_verify.data(), - values_result.data(), nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + ind_verify.data(), ind_result.data(), n_rows, raft::Compare())); + + linalg::csr_add_finalize(ind_a.data(), + ind_ptr_a.data(), + values_a.data(), + nnz_a, + ind_b.data(), + ind_ptr_b.data(), + values_b.data(), + nnz_b, + n_rows, + ind_result.data(), + ind_ptr_result.data(), + values_result.data(), + stream); + + ASSERT_TRUE(raft::devArrMatch( + ind_ptr_verify.data(), ind_ptr_result.data(), nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + values_verify.data(), values_result.data(), nnz, raft::Compare())); } protected: @@ -123,8 +131,8 @@ class CSRAddTest CSRAddInputs params; Index_ n_rows, nnz_a, nnz_b, nnz_result; - rmm::device_uvector ind_a, ind_b, ind_verify, ind_result, ind_ptr_a, - ind_ptr_b, ind_ptr_verify, ind_ptr_result; + rmm::device_uvector ind_a, ind_b, ind_verify, ind_result, ind_ptr_a, ind_ptr_b, + ind_ptr_verify, ind_ptr_result; rmm::device_uvector values_a, values_b, values_verify, values_result; }; @@ -157,10 +165,8 @@ const std::vector> csradd_inputs_d = { {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}}, }; -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, - ::testing::ValuesIn(csradd_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, - ::testing::ValuesIn(csradd_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index dd6ba1479e..5e4b164b37 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -50,24 +50,22 @@ struct ConnectComponentsInputs { }; template -class ConnectComponentsTest : public ::testing::TestWithParam< - ConnectComponentsInputs> { +class ConnectComponentsTest + : public ::testing::TestWithParam> { protected: - void basicTest() { + void basicTest() + { raft::handle_t handle; auto stream = handle.get_stream(); - params = ::testing::TestWithParam< - ConnectComponentsInputs>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); raft::sparse::COO out_edges(handle.get_stream()); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); rmm::device_uvector indptr(params.n_row + 1, stream); @@ -76,44 +74,58 @@ class ConnectComponentsTest : public ::testing::TestWithParam< */ raft::sparse::COO knn_graph_coo(stream); - raft::sparse::selection::knn_graph( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c); + raft::sparse::selection::knn_graph(handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + knn_graph_coo, + params.c); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, stream); /** * 2. Construct MST, sorted by weights */ rmm::device_uvector colors(params.n_row, stream); - auto mst_coo = raft::mst::mst( - handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(), - params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true); + auto mst_coo = raft::mst::mst(handle, + indptr.data(), + knn_graph_coo.cols(), + knn_graph_coo.vals(), + params.n_row, + knn_graph_coo.nnz, + colors.data(), + stream, + false, + true); /** * 3. connect_components to fix connectivities */ - raft::linkage::FixConnectivitiesRedOp red_op( - colors.data(), params.n_row); + raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); raft::linkage::connect_components( - handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, - red_op); + handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); /** * Construct final edge list */ rmm::device_uvector indptr2(params.n_row + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, - indptr2.data(), params.n_row + 1, - stream); + raft::sparse::convert::sorted_coo_to_csr( + out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, stream); - auto output_mst = raft::mst::mst( - handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, - out_edges.nnz, colors.data(), stream, false, false); + auto output_mst = raft::mst::mst(handle, + indptr2.data(), + out_edges.cols(), + out_edges.vals(), + params.n_row, + out_edges.nnz, + colors.data(), + stream, + false, + false); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -135,366 +147,199 @@ const std::vector> fix_conn_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, -1}, // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, -4}}; typedef ConnectComponentsTest ConnectComponentsTestF_Int; -TEST_P(ConnectComponentsTestF_Int, Result) { +TEST_P(ConnectComponentsTestF_Int, Result) +{ /** - * Verify the src & dst vertices on each edge have different colors - */ + * Verify the src & dst vertices on each edge have different colors + */ EXPECT_TRUE(final_edges == params.n_row - 1); } -INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int, +INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, + ConnectComponentsTestF_Int, ::testing::ValuesIn(fix_conn_inputsf2)); }; // namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index d30114bbcb..2028513010 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -44,23 +44,25 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { stream(handle.get_stream()), ex_scan(params.ex_scan.size(), stream), verify(params.verify.size(), stream), - result(params.verify.size(), stream) {} + result(params.verify.size(), stream) + { + } protected: void SetUp() override {} - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.verify.size(); + Index_ nnz = params.verify.size(); raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream); raft::update_device(verify.data(), params.verify.data(), nnz, stream); - convert::csr_to_coo(ex_scan.data(), n_rows, result.data(), nnz, - stream); + convert::csr_to_coo(ex_scan.data(), n_rows, result.data(), nnz, stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare(), stream)); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare(), stream)); } protected: @@ -86,9 +88,11 @@ const std::vector> csrtocoo_inputs_64 = { {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestI, ::testing::ValuesIn(csrtocoo_inputs_32)); -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestL, ::testing::ValuesIn(csrtocoo_inputs_64)); } // namespace sparse diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index cd665934c2..18e8b874bb 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -36,14 +36,13 @@ struct SparseConvertCSRInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseConvertCSRInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs& dims) +{ return os; } template -class SparseConvertCSRTest - : public ::testing::TestWithParam> { +class SparseConvertCSRTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -53,18 +52,18 @@ class SparseConvertCSRTest SparseConvertCSRInputs params; }; -const std::vector> inputsf = { - {5, 10, 5, 1234ULL}}; +const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseConvertCSRTest SortedCOOToCSR; -TEST_P(SortedCOOToCSR, Result) { +TEST_P(SortedCOOToCSR, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int nnz = 8; - int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *exp_h = new int[4]{0, 2, 4, 6}; + int* in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* exp_h = new int[4]{0, 2, 4, 6}; rmm::device_uvector in(nnz, stream); rmm::device_uvector exp(4, stream); @@ -78,8 +77,7 @@ TEST_P(SortedCOOToCSR, Result) { convert::sorted_coo_to_csr(in.data(), nnz, out.data(), 4, stream); - ASSERT_TRUE( - raft::devArrMatch(out.data(), exp.data(), 4, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.data(), exp.data(), 4, raft::Compare())); cudaStreamDestroy(stream); @@ -87,8 +85,7 @@ TEST_P(SortedCOOToCSR, Result) { delete[] exp_h; } -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf)); /******************************** adj graph ********************************/ @@ -102,8 +99,7 @@ struct CSRAdjGraphInputs { }; template -class CSRAdjGraphTest - : public ::testing::TestWithParam> { +class CSRAdjGraphTest : public ::testing::TestWithParam> { public: CSRAdjGraphTest() : params(::testing::TestWithParam>::GetParam()), @@ -111,24 +107,27 @@ class CSRAdjGraphTest row_ind(params.n_rows, stream), adj(params.n_rows * params.n_cols, stream), result(params.verify.size(), stream), - verify(params.verify.size(), stream) {} + verify(params.verify.size(), stream) + { + } protected: void SetUp() override { nnz = params.verify.size(); } - void Run() { - raft::update_device(row_ind.data(), params.row_ind.data(), params.n_rows, + void Run() + { + raft::update_device(row_ind.data(), params.row_ind.data(), params.n_rows, stream); + raft::update_device(adj.data(), + reinterpret_cast(params.adj.data()), + params.n_rows * params.n_cols, stream); - raft::update_device(adj.data(), reinterpret_cast(params.adj.data()), - params.n_rows * params.n_cols, stream); raft::update_device(verify.data(), params.verify.data(), nnz, stream); - convert::csr_adj_graph_batched(row_ind.data(), params.n_cols, - nnz, params.n_rows, adj.data(), - result.data(), stream); + convert::csr_adj_graph_batched( + row_ind.data(), params.n_cols, nnz, params.n_rows, adj.data(), result.data(), stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); } protected: @@ -162,9 +161,11 @@ const std::vector> csradjgraph_inputs_l = { {0, 1, 2, 0, 1, 2, 0, 1, 2}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestI, ::testing::ValuesIn(csradjgraph_inputs_i)); -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestL, ::testing::ValuesIn(csradjgraph_inputs_l)); } // namespace sparse diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 33893649bd..16372dc0f6 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -47,18 +47,16 @@ struct CSRRowSliceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRRowSliceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs& dims) +{ return os; } template -class CSRRowSliceTest - : public ::testing::TestWithParam> { +class CSRRowSliceTest : public ::testing::TestWithParam> { public: CSRRowSliceTest() - : params(::testing::TestWithParam< - CSRRowSliceInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), indptr(0, stream), indices(0, stream), @@ -68,7 +66,8 @@ class CSRRowSliceTest out_data_ref(0, stream), out_indptr(0, stream), out_indices(0, stream), - out_data(0, stream) { + out_data(0, stream) + { indptr.resize(params.indptr_h.size(), stream); indices.resize(params.indices_h.size(), stream); data.resize(params.data_h.size(), stream); @@ -81,54 +80,65 @@ class CSRRowSliceTest } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; - - update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref.data(), out_data_ref_h.data(), - out_data_ref_h.size(), stream); + std::vector out_data_ref_h = params.out_data_ref_h; + + update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref.data(), out_data_ref_h.data(), out_data_ref_h.size(), stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void SetUp() override { + void SetUp() override + { make_data(); int csr_start_offset; int csr_stop_offset; - raft::sparse::op::csr_row_slice_indptr( - params.start_row, params.stop_row, indptr.data(), out_indptr.data(), - &csr_start_offset, &csr_stop_offset, stream); - - raft::sparse::op::csr_row_slice_populate( - csr_start_offset, csr_stop_offset, indices.data(), data.data(), - out_indices.data(), out_data.data(), stream); + raft::sparse::op::csr_row_slice_indptr(params.start_row, + params.stop_row, + indptr.data(), + out_indptr.data(), + &csr_start_offset, + &csr_stop_offset, + stream); + + raft::sparse::op::csr_row_slice_populate(csr_start_offset, + csr_stop_offset, + indices.data(), + data.data(), + out_indices.data(), + out_data.data(), + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr.data(), out_indptr_ref.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_indptr.data(), + out_indptr_ref.data(), params.out_indptr_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_indices.data(), out_indices_ref.data(), + ASSERT_TRUE(devArrMatch(out_indices.data(), + out_indices_ref.data(), params.out_indices_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_data.data(), out_data_ref.data(), - params.out_data_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_data.data(), out_data_ref.data(), params.out_data_ref_h.size(), Compare())); } protected: @@ -173,8 +183,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRRowSliceTest CSRRowSliceTestF; TEST_P(CSRRowSliceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 1a206c8499..85f00cdd27 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -45,24 +45,23 @@ struct CSRToDenseInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRToDenseInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs& dims) +{ return os; } template -class CSRToDenseTest - : public ::testing::TestWithParam> { +class CSRToDenseTest : public ::testing::TestWithParam> { public: CSRToDenseTest() - : params(::testing::TestWithParam< - CSRToDenseInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(raft_handle.get_stream()), indptr(0, stream), indices(0, stream), data(0, stream), out_ref(0, stream), - out(0, stream) { + out(0, stream) + { indptr.resize(params.indptr_h.size(), stream); indices.resize(params.indices_h.size(), stream); data.resize(params.data_h.size(), stream); @@ -71,10 +70,11 @@ class CSRToDenseTest } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); update_device(indices.data(), indices_h.data(), indices_h.size(), stream); @@ -86,22 +86,31 @@ class CSRToDenseTest CUDA_CHECK(cudaStreamSynchronize(stream)); } - void SetUp() override { + void SetUp() override + { CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - convert::csr_to_dense(handle, params.nrows, params.ncols, indptr.data(), - indices.data(), data.data(), params.nrows, out.data(), - stream, true); + convert::csr_to_dense(handle, + params.nrows, + params.ncols, + indptr.data(), + indices.data(), + data.data(), + params.nrows, + out.data(), + stream, + true); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void compare() { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.out_ref_h.size(), - Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out.data(), out_ref.data(), params.out_ref_h.size(), Compare())); } protected: @@ -129,13 +138,26 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 2, 3, 0, 1, 2, 3}, // indices {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, - {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 16.0f, 2.0f}}, + {1.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 5.0f, + 50.0f, + 28.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 16.0f, + 2.0f}}, }; typedef CSRToDenseTest CSRToDenseTestF; TEST_P(CSRToDenseTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index 8983f10d2b..3380eaa6fb 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -47,18 +47,16 @@ struct CSRTransposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRTransposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs& dims) +{ return os; } template -class CSRTransposeTest - : public ::testing::TestWithParam> { +class CSRTransposeTest : public ::testing::TestWithParam> { public: CSRTransposeTest() - : params(::testing::TestWithParam< - CSRTransposeInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(raft_handle.get_stream()), indptr(0, stream), indices(0, stream), @@ -68,7 +66,8 @@ class CSRTransposeTest out_data_ref(0, stream), out_indptr(0, stream), out_indices(0, stream), - out_data(0, stream) { + out_data(0, stream) + { indptr.resize(params.indptr_h.size(), stream); indices.resize(params.indices_h.size(), stream); data.resize(params.data_h.size(), stream); @@ -81,50 +80,60 @@ class CSRTransposeTest } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; - - update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref.data(), out_data_ref_h.data(), - out_data_ref_h.size(), stream); + std::vector out_data_ref_h = params.out_data_ref_h; + + update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref.data(), out_data_ref_h.data(), out_data_ref_h.size(), stream); } - void SetUp() override { + void SetUp() override + { CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - raft::sparse::linalg::csr_transpose( - handle, indptr.data(), indices.data(), data.data(), out_indptr.data(), - out_indices.data(), out_data.data(), params.nrows, params.ncols, - params.nnz, stream); + raft::sparse::linalg::csr_transpose(handle, + indptr.data(), + indices.data(), + data.data(), + out_indptr.data(), + out_indices.data(), + out_data.data(), + params.nrows, + params.ncols, + params.nnz, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr.data(), out_indptr_ref.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_indptr.data(), + out_indptr_ref.data(), params.out_indptr_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_indices.data(), out_indices_ref.data(), + ASSERT_TRUE(devArrMatch(out_indices.data(), + out_indices_ref.data(), params.out_indices_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_data.data(), out_data_ref.data(), - params.out_data_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_data.data(), out_data_ref.data(), params.out_data_ref_h.size(), Compare())); } protected: @@ -163,8 +172,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRTransposeTest CSRTransposeTestF; TEST_P(CSRTransposeTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index fbadadb29d..8b1c7988d6 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -33,8 +33,7 @@ struct SparseDegreeInputs { }; template -class SparseDegreeTests - : public ::testing::TestWithParam> { +class SparseDegreeTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -47,20 +46,19 @@ class SparseDegreeTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; -TEST_P(COODegree, Result) { +TEST_P(COODegree, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int in_rows_h[5] = {0, 0, 1, 2, 2}; - int verify_h[5] = {2, 1, 2, 0, 0}; + int verify_h[5] = {2, 1, 2, 0, 0}; rmm::device_uvector in_rows(5, stream); rmm::device_uvector verify(5, stream); rmm::device_uvector results(5, stream); - CUDA_CHECK( - cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); raft::update_device(in_rows.data(), *&in_rows_h, 5, stream); raft::update_device(verify.data(), *&verify_h, 5, stream); @@ -68,50 +66,43 @@ TEST_P(COODegree, Result) { linalg::coo_degree<32>(in_rows.data(), 5, results.data(), stream); cudaDeviceSynchronize(); - ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); } typedef SparseDegreeTests COODegreeNonzero; -TEST_P(COODegreeNonzero, Result) { +TEST_P(COODegreeNonzero, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); - int in_rows_h[5] = {0, 0, 1, 2, 2}; + int in_rows_h[5] = {0, 0, 1, 2, 2}; float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; - int verify_h[5] = {1, 0, 2, 0, 0}; + int verify_h[5] = {1, 0, 2, 0, 0}; rmm::device_uvector in_rows(5, stream); rmm::device_uvector verify(5, stream); rmm::device_uvector results(5, stream); rmm::device_uvector in_vals(5, stream); - CUDA_CHECK( - cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream)); + CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream)); raft::update_device(in_rows.data(), *&in_rows_h, 5, stream); raft::update_device(verify.data(), *&verify_h, 5, stream); raft::update_device(in_vals.data(), *&in_vals_h, 5, stream); - linalg::coo_degree_nz<32, float>(in_rows.data(), in_vals.data(), 5, - results.data(), stream); + linalg::coo_degree_nz<32, float>(in_rows.data(), in_vals.data(), 5, results.data(), stream); cudaDeviceSynchronize(); - ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); } -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index d24199c5fc..000d58d029 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -55,28 +55,26 @@ struct InputConfiguration { }; using dense_smem_strategy_t = detail::dense_smem_strategy; -using hash_strategy_t = detail::hash_strategy; +using hash_strategy_t = detail::hash_strategy; template struct SparseDistanceCOOSPMVInputs { InputConfiguration input_configuration; float capacity_threshold = 0.5; - int map_size = - detail::hash_strategy::get_map_size(); + int map_size = detail::hash_strategy::get_map_size(); }; template -::std::ostream &operator<<( - ::std::ostream &os, - const SparseDistanceCOOSPMVInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseDistanceCOOSPMVInputs& dims) +{ return os; } template class SparseDistanceCOOSPMVTest - : public ::testing::TestWithParam< - SparseDistanceCOOSPMVInputs> { + : public ::testing::TestWithParam> { public: SparseDistanceCOOSPMVTest() : dist_config(handle), @@ -84,62 +82,74 @@ class SparseDistanceCOOSPMVTest indices(0, handle.get_stream()), data(0, handle.get_stream()), out_dists(0, handle.get_stream()), - out_dists_ref(0, handle.get_stream()) {} + out_dists_ref(0, handle.get_stream()) + { + } - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config, params.capacity_threshold, params.map_size); } - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config); } template - void compute_dist(reduce_f reduce_func, accum_f accum_func, - write_f write_func, bool rev = true) { - rmm::device_uvector coo_rows( - max(dist_config.b_nnz, dist_config.a_nnz), - dist_config.handle.get_stream()); - - raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, - coo_rows.data(), dist_config.b_nnz, + void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) + { + rmm::device_uvector coo_rows(max(dist_config.b_nnz, dist_config.a_nnz), + dist_config.handle.get_stream()); + + raft::sparse::convert::csr_to_coo(dist_config.b_indptr, + dist_config.b_nrows, + coo_rows.data(), + dist_config.b_nnz, dist_config.handle.get_stream()); strategy_t selected_strategy = make_strategy(); - detail::balanced_coo_pairwise_generalized_spmv( - out_dists.data(), dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + detail::balanced_coo_pairwise_generalized_spmv(out_dists.data(), + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); if (rev) { - raft::sparse::convert::csr_to_coo( - dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(), - dist_config.a_nnz, dist_config.handle.get_stream()); - - detail::balanced_coo_pairwise_generalized_spmv_rev( - out_dists.data(), dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + raft::sparse::convert::csr_to_coo(dist_config.a_indptr, + dist_config.a_nrows, + coo_rows.data(), + dist_config.a_nnz, + dist_config.handle.get_stream()); + + detail::balanced_coo_pairwise_generalized_spmv_rev(out_dists.data(), + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); } } - void run_spmv() { + void run_spmv() + { switch (params.input_configuration.metric) { case raft::distance::DistanceType::InnerProduct: - compute_dist(detail::Product(), detail::Sum(), detail::AtomicAdd(), - true); + compute_dist(detail::Product(), detail::Sum(), detail::AtomicAdd(), true); break; case raft::distance::DistanceType::L2Unexpanded: compute_dist(detail::SqDiff(), detail::Sum(), detail::AtomicAdd()); break; case raft::distance::DistanceType::Canberra: compute_dist( - [] __device__(value_t a, value_t b) { - return fabsf(a - b) / (fabsf(a) + fabsf(b)); - }, - detail::Sum(), detail::AtomicAdd()); + [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); }, + detail::Sum(), + detail::AtomicAdd()); break; case raft::distance::DistanceType::L1: compute_dist(detail::AbsDiff(), detail::Sum(), detail::AtomicAdd()); @@ -148,26 +158,27 @@ class SparseDistanceCOOSPMVTest compute_dist(detail::AbsDiff(), detail::Max(), detail::AtomicMax()); break; case raft::distance::DistanceType::LpUnexpanded: { - compute_dist(detail::PDiff(params.input_configuration.metric_arg), - detail::Sum(), detail::AtomicAdd()); + compute_dist( + detail::PDiff(params.input_configuration.metric_arg), detail::Sum(), detail::AtomicAdd()); float p = 1.0f / params.input_configuration.metric_arg; raft::linalg::unaryOp( - out_dists.data(), out_dists.data(), + out_dists.data(), + out_dists.data(), dist_config.a_nrows * dist_config.b_nrows, [=] __device__(value_t input) { return powf(input, p); }, dist_config.handle.get_stream()); } break; - default: - throw raft::exception("Unknown distance"); + default: throw raft::exception("Unknown distance"); } } protected: - void make_data() { - std::vector indptr_h = params.input_configuration.indptr_h; + void make_data() + { + std::vector indptr_h = params.input_configuration.indptr_h; std::vector indices_h = params.input_configuration.indices_h; - std::vector data_h = params.input_configuration.data_h; + std::vector data_h = params.input_configuration.data_h; auto stream = handle.get_stream(); indptr.resize(indptr_h.size(), stream); @@ -178,33 +189,32 @@ class SparseDistanceCOOSPMVTest update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_dists_ref_h = - params.input_configuration.out_dists_ref_h; + std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); } - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam< SparseDistanceCOOSPMVInputs>::GetParam(); make_data(); - dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.b_ncols = params.input_configuration.n_cols; - dist_config.b_nnz = params.input_configuration.indices_h.size(); - dist_config.b_indptr = indptr.data(); + dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.b_ncols = params.input_configuration.n_cols; + dist_config.b_nnz = params.input_configuration.indices_h.size(); + dist_config.b_indptr = indptr.data(); dist_config.b_indices = indices.data(); - dist_config.b_data = data.data(); - dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.a_ncols = params.input_configuration.n_cols; - dist_config.a_nnz = params.input_configuration.indices_h.size(); - dist_config.a_indptr = indptr.data(); + dist_config.b_data = data.data(); + dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.a_ncols = params.input_configuration.n_cols; + dist_config.a_nnz = params.input_configuration.indices_h.size(); + dist_config.a_indptr = indptr.data(); dist_config.a_indices = indices.data(); - dist_config.a_data = data.data(); + dist_config.a_data = data.data(); int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -215,8 +225,10 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref.data(), + out_dists.data(), params.input_configuration.out_dists_ref_h.size(), CompareApprox(1e-3))); } @@ -241,8 +253,7 @@ const InputConfiguration input_inner_product = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}; @@ -273,384 +284,379 @@ const InputConfiguration input_l2_unexpanded = { raft::distance::DistanceType::L2Unexpanded, 0.0}; -const InputConfiguration input_canberra = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 3.3954660629919076, - 5.6469232737388815, - 6.373112846266441, - 4.0212880272531715, - 6.916281504639404, - 5.741508386786526, - 5.411470999663036, - 9.0, - 4.977014354725805, - 3.3954660629919076, - 0.0, - 7.56256082439209, - 5.540261147481582, - 4.832322929216881, - 4.62003193872216, - 6.498056792320361, - 4.309846252268695, - 6.317531174829905, - 6.016362684141827, - 5.6469232737388815, - 7.56256082439209, - 0.0, - 5.974878731322299, - 4.898357301336036, - 6.442097410320605, - 5.227077347287883, - 7.134101195584642, - 5.457753923371659, - 7.0, - 6.373112846266441, - 5.540261147481582, - 5.974878731322299, - 0.0, - 5.5507273748583, - 4.897749658726415, - 9.0, - 8.398776718824767, - 3.908281400328807, - 4.83431066343688, - 4.0212880272531715, - 4.832322929216881, - 4.898357301336036, - 5.5507273748583, - 0.0, - 6.632989819428174, - 7.438852294822894, - 5.6631570310967465, - 7.579428202635459, - 6.760811985364303, - 6.916281504639404, - 4.62003193872216, - 6.442097410320605, - 4.897749658726415, - 6.632989819428174, - 0.0, - 5.249404187382862, - 6.072559523278559, - 4.07661278488929, - 6.19678948003145, - 5.741508386786526, - 6.498056792320361, - 5.227077347287883, - 9.0, - 7.438852294822894, - 5.249404187382862, - 0.0, - 3.854811639654704, - 6.652724827169063, - 5.298236851430971, - 5.411470999663036, - 4.309846252268695, - 7.134101195584642, - 8.398776718824767, - 5.6631570310967465, - 6.072559523278559, - 3.854811639654704, - 0.0, - 7.529184598969917, - 6.903282911791188, - 9.0, - 6.317531174829905, - 5.457753923371659, - 3.908281400328807, - 7.579428202635459, - 4.07661278488929, - 6.652724827169063, - 7.529184598969917, - 0.0, - 7.0, - 4.977014354725805, - 6.016362684141827, - 7.0, - 4.83431066343688, - 6.760811985364303, - 6.19678948003145, - 5.298236851430971, - 6.903282911791188, - 7.0, - 0.0}, - raft::distance::DistanceType::Canberra, - 0.0}; - -const InputConfiguration input_lp_unexpanded = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 1.31462855332296, - 1.3690307816129905, - 1.698603990921237, - 1.3460470789553531, - 1.6636670712582544, - 1.2651744044972217, - 1.1938329352055201, - 1.8811409082590185, - 1.3653115050624267, - 1.31462855332296, - 0.0, - 1.9447722703291133, - 1.42818777206562, - 1.4685491458946494, - 1.3071999866010466, - 1.4988622861692171, - 0.9698559287406783, - 1.4972023224597841, - 1.5243383567266802, - 1.3690307816129905, - 1.9447722703291133, - 0.0, - 1.2748400840107568, - 1.0599569946448246, - 1.546591282841402, - 1.147526531928459, - 1.447002179128145, - 1.5982242387673176, - 1.3112533607072414, - 1.698603990921237, - 1.42818777206562, - 1.2748400840107568, - 0.0, - 1.038121552545461, - 1.011788365364402, - 1.3907391109256988, - 1.3128200942311496, - 1.19595706584447, - 1.3233328139624725, - 1.3460470789553531, - 1.4685491458946494, - 1.0599569946448246, - 1.038121552545461, - 0.0, - 1.3642741698145529, - 1.3493868683808095, - 1.394942694628328, - 1.572881849642552, - 1.380122665319464, - 1.6636670712582544, - 1.3071999866010466, - 1.546591282841402, - 1.011788365364402, - 1.3642741698145529, - 0.0, - 1.018961640373018, - 1.0114394258945634, - 0.8338711034820684, - 1.1247823842299223, - 1.2651744044972217, - 1.4988622861692171, - 1.147526531928459, - 1.3907391109256988, - 1.3493868683808095, - 1.018961640373018, - 0.0, - 0.7701238110357329, - 1.245486437864406, - 0.5551259549534626, - 1.1938329352055201, - 0.9698559287406783, - 1.447002179128145, - 1.3128200942311496, - 1.394942694628328, - 1.0114394258945634, - 0.7701238110357329, - 0.0, - 1.1886800117391216, - 1.0083692448135637, - 1.8811409082590185, - 1.4972023224597841, - 1.5982242387673176, - 1.19595706584447, - 1.572881849642552, - 0.8338711034820684, - 1.245486437864406, - 1.1886800117391216, - 0.0, - 1.3661374102525012, - 1.3653115050624267, - 1.5243383567266802, - 1.3112533607072414, - 1.3233328139624725, - 1.380122665319464, - 1.1247823842299223, - 0.5551259549534626, - 1.0083692448135637, - 1.3661374102525012, - 0.0}, - raft::distance::DistanceType::LpUnexpanded, - 2.0}; - -const InputConfiguration input_linf = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 0.9251771844789913, - 0.9036452083899731, - 0.9251771844789913, - 0.8706483735804971, - 0.9251771844789913, - 0.717493881903289, - 0.6920214832303888, - 0.9251771844789913, - 0.9251771844789913, - 0.9251771844789913, - 0.0, - 0.9036452083899731, - 0.8655339692155823, - 0.8706483735804971, - 0.8655339692155823, - 0.8655339692155823, - 0.6329837991017668, - 0.8655339692155823, - 0.8655339692155823, - 0.9036452083899731, - 0.9036452083899731, - 0.0, - 0.7988276152181608, - 0.7028075145996631, - 0.9036452083899731, - 0.9036452083899731, - 0.9036452083899731, - 0.8429599432532096, - 0.9036452083899731, - 0.9251771844789913, - 0.8655339692155823, - 0.7988276152181608, - 0.0, - 0.48376552205293305, - 0.8206394616536681, - 0.8206394616536681, - 0.8206394616536681, - 0.8429599432532096, - 0.8206394616536681, - 0.8706483735804971, - 0.8706483735804971, - 0.7028075145996631, - 0.48376552205293305, - 0.0, - 0.8706483735804971, - 0.8706483735804971, - 0.8706483735804971, - 0.8429599432532096, - 0.8706483735804971, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.0, - 0.8853924473642432, - 0.535821510936138, - 0.6497196601457607, - 0.8853924473642432, - 0.717493881903289, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.0, - 0.5279604218147174, - 0.6658348373853169, - 0.33799874888632914, - 0.6920214832303888, - 0.6329837991017668, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.535821510936138, - 0.5279604218147174, - 0.0, - 0.662579808115858, - 0.5079750812968089, - 0.9251771844789913, - 0.8655339692155823, - 0.8429599432532096, - 0.8429599432532096, - 0.8429599432532096, - 0.6497196601457607, - 0.6658348373853169, - 0.662579808115858, - 0.0, - 0.8429599432532096, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.33799874888632914, - 0.5079750812968089, - 0.8429599432532096, - 0.0}, - raft::distance::DistanceType::Linf, - 0.0}; - -const InputConfiguration input_l1 = { - 4, - {0, 1, 1, 2, 4}, - {3, 2, 0, 1}, // indices - {0.99296, 0.42180, 0.11687, 0.305869}, - { - // dense output - 0.0, - 0.99296, - 1.41476, - 1.415707, - 0.99296, - 0.0, - 0.42180, - 0.42274, - 1.41476, - 0.42180, - 0.0, - 0.84454, - 1.41570, - 0.42274, - 0.84454, - 0.0, - }, - raft::distance::DistanceType::L1, +const InputConfiguration input_canberra = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra, 0.0}; +const InputConfiguration input_lp_unexpanded = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}; + +const InputConfiguration input_linf = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf, + 0.0}; + +const InputConfiguration input_l1 = {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + raft::distance::DistanceType::L1, + 0.0}; + // test dense smem strategy -const std::vector< - SparseDistanceCOOSPMVInputs> - inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded}, - {input_canberra}, {input_lp_unexpanded}, - {input_linf}, {input_l1}}; +const std::vector> + inputs_dense_strategy = {{input_inner_product}, + {input_l2_unexpanded}, + {input_canberra}, + {input_lp_unexpanded}, + {input_linf}, + {input_l1}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestDenseStrategyF; @@ -660,22 +666,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests, ::testing::ValuesIn(inputs_dense_strategy)); // test hash and chunk strategy -const std::vector> - inputs_hash_strategy = {{input_inner_product}, - {input_inner_product, 0.5, 2}, - {input_l2_unexpanded}, - {input_l2_unexpanded, 0.5, 2}, - {input_canberra}, - {input_canberra, 0.5, 2}, - {input_canberra, 0.5, 6}, - {input_lp_unexpanded}, - {input_lp_unexpanded, 0.5, 2}, - {input_lp_unexpanded, 0.5, 6}, - {input_linf}, - {input_linf, 0.5, 2}, - {input_linf, 0.5, 6}, - {input_l1}, - {input_l1, 0.5, 2}}; +const std::vector> inputs_hash_strategy = { + {input_inner_product}, + {input_inner_product, 0.5, 2}, + {input_l2_unexpanded}, + {input_l2_unexpanded, 0.5, 2}, + {input_canberra}, + {input_canberra, 0.5, 2}, + {input_canberra, 0.5, 6}, + {input_lp_unexpanded}, + {input_lp_unexpanded, 0.5, 2}, + {input_lp_unexpanded, 0.5, 6}, + {input_linf}, + {input_linf, 0.5, 2}, + {input_linf, 0.5, 6}, + {input_l1}, + {input_l1, 0.5, 2}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestHashStrategyF; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 3bc562bb68..8538c9cf39 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -49,8 +49,8 @@ struct SparseDistanceInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseDistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) +{ return os; } @@ -59,52 +59,56 @@ class SparseDistanceTest : public ::testing::TestWithParam> { public: SparseDistanceTest() - : params(::testing::TestWithParam< - SparseDistanceInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), dist_config(handle), indptr(0, handle.get_stream()), indices(0, handle.get_stream()), data(0, handle.get_stream()), out_dists(0, handle.get_stream()), - out_dists_ref(0, handle.get_stream()) {} + out_dists_ref(0, handle.get_stream()) + { + } - void SetUp() override { + void SetUp() override + { make_data(); - dist_config.b_nrows = params.indptr_h.size() - 1; - dist_config.b_ncols = params.n_cols; - dist_config.b_nnz = params.indices_h.size(); - dist_config.b_indptr = indptr.data(); + dist_config.b_nrows = params.indptr_h.size() - 1; + dist_config.b_ncols = params.n_cols; + dist_config.b_nnz = params.indices_h.size(); + dist_config.b_indptr = indptr.data(); dist_config.b_indices = indices.data(); - dist_config.b_data = data.data(); - dist_config.a_nrows = params.indptr_h.size() - 1; - dist_config.a_ncols = params.n_cols; - dist_config.a_nnz = params.indices_h.size(); - dist_config.a_indptr = indptr.data(); + dist_config.b_data = data.data(); + dist_config.a_nrows = params.indptr_h.size() - 1; + dist_config.a_ncols = params.n_cols; + dist_config.a_nnz = params.indices_h.size(); + dist_config.a_indptr = indptr.data(); dist_config.a_indices = indices.data(); - dist_config.a_data = data.data(); + dist_config.a_data = data.data(); int out_size = dist_config.a_nrows * dist_config.b_nrows; out_dists.resize(out_size, handle.get_stream()); - pairwiseDistance(out_dists.data(), dist_config, params.metric, - params.metric_arg); + pairwiseDistance(out_dists.data(), dist_config, params.metric, params.metric_arg); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref.data(), + out_dists.data(), params.out_dists_ref_h.size(), CompareApprox(1e-3))); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; auto stream = handle.get_stream(); indptr.resize(indptr_h.size(), stream); @@ -119,8 +123,10 @@ class SparseDistanceTest out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), dist_config.handle.get_stream()); + update_device(out_dists_ref.data(), + out_dists_ref_h.data(), + out_dists_ref_h.size(), + dist_config.handle.get_stream()); } raft::handle_t handle; @@ -182,8 +188,7 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}, {2, @@ -214,40 +219,33 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, - 0.58146987, 0.44940102, 1., 0.76978799, 0.39419924, 0., - 0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481, - 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, - 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., - 0.79593037, 0.48904013, 0.51413997, 0., 0.28605559, 0.35772784, - 1., 0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801, - 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, - 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, - 0.58623212, 0., 0.77917274, 0.48390993, 0.24558392, 0.99166225, - 0.58146987, 0.73323749, 0.67534399, 1., 0.6745457, 0.77917274, - 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, - 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., - 0.51360432, 0.68185144, 1., 0.54847744, 0.8321819, 0.43324829, - 0.67676228, 0.24558392, 0.76064776, 0.51360432, 0., 1., - 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, 0.61547536, 0.68185144, 1., 0.}, raft::distance::DistanceType::CosineExpanded, 0.0}, {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, @@ -356,15 +354,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 3.3954660629919076, 5.6469232737388815, @@ -470,15 +466,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 1.31462855332296, 1.3690307816129905, @@ -584,15 +578,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 0.9251771844789913, 0.9036452083899731, @@ -698,17 +690,14 @@ const std::vector> inputs_i32_f = { {15, {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, - {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, - 0, 3, 7, 8, 12, 0, 2, 5, 7, 8, 14, 4, 9, 10, 11, - 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, - {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, - 0.73789274, 0.08450219, 1., 0.20184723, 0.18036963, 0.12581403, - 0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555, - 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, - 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, - 0.15605804, 0.3867739, 0.24908977, 0.36413632, 0.37643732, 0.28910679, - 0.0198409, 0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969, - 0.26190054, 0.2077349, 0.10803964}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, @@ -767,31 +756,25 @@ const std::vector> inputs_i32_f = { {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, - {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, - 0.99584118, 0.76835667, 0.34426657, 0.2357925, 0.01274851, 0.11422017, - 0.3437756, 0.31967718, 0.5956055, 0.31610373, 0.04147273, 0.03724415, - 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, - 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, - 0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608, - 0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295, - 0.27366735, 0.5119944, 0.11416134}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, {// dense output - 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, - 0.76962708, 1.122858, 1.1232498, 1.08166081, 0.48769777, 0., - 1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055, - 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, - 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, - 0.26127048, 0.98318907, 1.82943642, 0., 0.29945563, 1.08494093, - 0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815, - 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, - 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, - 0.45060069, 0., 1.29899154, 1.40683824, 0.48505269, 0.53862363, - 0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154, - 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, - 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., - 1.47318624, 1.92660889, 1.1232498, 0.40658897, 0.60215168, 1.74288748, - 1.18328348, 0.48505269, 1.92108999, 1.47318624, 0., 0.24992619, - 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, 1.88812175, 1.92660889, 0.24992619, 0.}, raft::distance::DistanceType::CorrelationExpanded, 0.0}, @@ -800,12 +783,11 @@ const std::vector> inputs_i32_f = { {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., - 1., 1., 1., 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., - 1., 1., 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., - 1., 1., 1., 1., 0., 1., 0.8, 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., - 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, raft::distance::DistanceType::RusselRaoExpanded, 0.0}, {5, @@ -813,13 +795,12 @@ const std::vector> inputs_i32_f = { {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, - 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., - 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., - 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, - 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0., 0.2, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, raft::distance::DistanceType::HammingUnexpanded, 0.0}, {3, @@ -863,7 +844,8 @@ const std::vector> inputs_i32_f = { typedef SparseDistanceTest SparseDistanceTestF; TEST_P(SparseDistanceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF, +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // namespace distance diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index 58ad9cf803..63245a63b0 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -35,8 +35,7 @@ struct SparseFilterInputs { }; template -class SparseFilterTests - : public ::testing::TestWithParam> { +class SparseFilterTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -49,12 +48,13 @@ class SparseFilterTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseFilterTests COORemoveZeros; -TEST_P(COORemoveZeros, Result) { +TEST_P(COORemoveZeros, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); params = ::testing::TestWithParam>::GetParam(); - float *in_h_vals = new float[params.nnz]; + float* in_h_vals = new float[params.nnz]; COO in(stream, params.nnz, 5, 5); @@ -67,8 +67,8 @@ TEST_P(COORemoveZeros, Result) { in_h_vals[2] = 0; in_h_vals[3] = 0; - int *in_h_rows = new int[params.nnz]; - int *in_h_cols = new int[params.nnz]; + int* in_h_rows = new int[params.nnz]; + int* in_h_cols = new int[params.nnz]; for (int i = 0; i < params.nnz; i++) { in_h_rows[i] = params.nnz - i - 1; @@ -84,9 +84,9 @@ TEST_P(COORemoveZeros, Result) { int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; - float *out_vals_ref_h = (float *)malloc(2 * sizeof(float)); - out_vals_ref_h[0] = in_h_vals[4]; - out_vals_ref_h[1] = in_h_vals[1]; + float* out_vals_ref_h = (float*)malloc(2 * sizeof(float)); + out_vals_ref_h[0] = in_h_vals[4]; + out_vals_ref_h[1] = in_h_vals[1]; COO out_ref(stream, 2, 5, 5); COO out(stream); @@ -97,12 +97,9 @@ TEST_P(COORemoveZeros, Result) { op::coo_remove_zeros<32, float>(&in, &out, stream); - ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); free(out_vals_ref_h); @@ -112,8 +109,7 @@ TEST_P(COORemoveZeros, Result) { delete[] in_h_vals; } -INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 86b3b3d382..a693262193 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -48,60 +48,76 @@ struct SparseKNNInputs { int batch_size_index = 2; int batch_size_query = 2; - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded; + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded; }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseKNNInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ return os; } template -class SparseKNNTest - : public ::testing::TestWithParam> { +class SparseKNNTest : public ::testing::TestWithParam> { public: SparseKNNTest() - : params(::testing::TestWithParam< - SparseKNNInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), indptr(0, handle.get_stream()), indices(0, handle.get_stream()), data(0, handle.get_stream()), out_indices(0, handle.get_stream()), out_dists(0, handle.get_stream()), out_indices_ref(0, handle.get_stream()), - out_dists_ref(0, handle.get_stream()) {} + out_dists_ref(0, handle.get_stream()) + { + } protected: - void SetUp() override { + void SetUp() override + { n_rows = params.indptr_h.size() - 1; - nnz = params.indices_h.size(); - k = params.k; + nnz = params.indices_h.size(); + k = params.k; make_data(); - raft::sparse::selection::brute_force_knn( - indptr.data(), indices.data(), data.data(), nnz, n_rows, params.n_cols, - indptr.data(), indices.data(), data.data(), nnz, n_rows, params.n_cols, - out_indices.data(), out_dists.data(), k, handle, params.batch_size_index, - params.batch_size_query, params.metric); + raft::sparse::selection::brute_force_knn(indptr.data(), + indices.data(), + data.data(), + nnz, + n_rows, + params.n_cols, + indptr.data(), + indices.data(), + data.data(), + nnz, + n_rows, + params.n_cols, + out_indices.data(), + out_dists.data(), + k, + handle, + params.batch_size_index, + params.batch_size_query, + params.metric); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k, - CompareApprox(1e-4))); - ASSERT_TRUE(devArrMatch(out_indices_ref.data(), out_indices.data(), - n_rows * k, Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; auto stream = handle.get_stream(); indptr.resize(indptr_h.size(), stream); @@ -112,16 +128,15 @@ class SparseKNNTest update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; out_indices_ref.resize(out_indices_ref_h.size(), stream); out_dists_ref.resize(out_dists_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); out_dists.resize(n_rows * k, stream); out_indices.resize(n_rows * k, stream); @@ -158,8 +173,7 @@ const std::vector> inputs_i32_f = { raft::distance::DistanceType::L2SqrtExpanded}}; typedef SparseKNNTest SparseKNNTestF; TEST_P(SparseKNNTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection }; // end namespace sparse diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index c2a1c4b93c..1ed017f40a 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -30,8 +30,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -51,32 +52,31 @@ struct KNNGraphInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const KNNGraphInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs& dims) +{ return os; } template -class KNNGraphTest - : public ::testing::TestWithParam> { +class KNNGraphTest : public ::testing::TestWithParam> { public: KNNGraphTest() - : params(::testing::TestWithParam< - KNNGraphInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), - X(0, stream) { + X(0, stream) + { X.resize(params.X.size(), stream); } protected: - void SetUp() override { + void SetUp() override + { out = new raft::sparse::COO(stream); update_device(X.data(), params.X.data(), params.X.size(), stream); raft::sparse::selection::knn_graph( - handle, X.data(), params.m, params.n, - raft::distance::DistanceType::L2Unexpanded, *out); + handle, X.data(), params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); rmm::device_scalar sum(stream); sum.set_value_to_zero_async(stream); @@ -98,7 +98,7 @@ class KNNGraphTest cudaStream_t stream; // input data - raft::sparse::COO *out; + raft::sparse::COO* out; rmm::device_uvector X; @@ -112,13 +112,15 @@ const std::vector> knn_graph_inputs_fint = { {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}}; typedef KNNGraphTest KNNGraphTestF_int; -TEST_P(KNNGraphTestF_int, Result) { +TEST_P(KNNGraphTestF_int, Result) +{ // nnz should not be larger than twice m * k ASSERT_TRUE(out->nnz <= (params.m * params.k * 2)); ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int, +INSTANTIATE_TEST_CASE_P(KNNGraphTest, + KNNGraphTestF_int, ::testing::ValuesIn(knn_graph_inputs_fint)); } // namespace sparse diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 6d4af7f016..50401e5b7a 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -55,45 +55,44 @@ struct LinkageInputs { * @param b: number of pairs of points that both the clusters have classified differently */ template -__global__ void computeTheNumerator(const T* firstClusterArray, - const T* secondClusterArray, uint64_t size, - uint64_t* a, uint64_t* b) { - //calculating the indices of pairs of datapoints compared by the current thread +__global__ void computeTheNumerator( + const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b) +{ + // calculating the indices of pairs of datapoints compared by the current thread uint64_t j = threadIdx.x + blockIdx.x * blockDim.x; uint64_t i = threadIdx.y + blockIdx.y * blockDim.y; - //thread-local variables to count a and b + // thread-local variables to count a and b uint64_t myA = 0, myB = 0; if (i < size && j < size && j < i) { - //checking if the pair have been classified the same by both the clusters + // checking if the pair have been classified the same by both the clusters if (firstClusterArray[i] == firstClusterArray[j] && secondClusterArray[i] == secondClusterArray[j]) { ++myA; } - //checking if the pair have been classified differently by both the clusters + // checking if the pair have been classified differently by both the clusters else if (firstClusterArray[i] != firstClusterArray[j] && secondClusterArray[i] != secondClusterArray[j]) { ++myB; } } - //specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce + // specialize blockReduce for a 2D block of 1024 threads of type uint64_t + typedef cub::BlockReduce BlockReduce; - //Allocate shared memory for blockReduce + // Allocate shared memory for blockReduce __shared__ typename BlockReduce::TempStorage temp_storage; - //summing up thread-local counts specific to a block + // summing up thread-local counts specific to a block myA = BlockReduce(temp_storage).Sum(myA); __syncthreads(); myB = BlockReduce(temp_storage).Sum(myB); __syncthreads(); - //executed once per block + // executed once per block if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd((unsigned long long int*)a, myA); raft::myAtomicAdd((unsigned long long int*)b, myB); @@ -101,53 +100,54 @@ __global__ void computeTheNumerator(const T* firstClusterArray, } /** -* @brief Function to calculate RandIndex -* more info on rand index -* @param firstClusterArray: the array of classes of type T -* @param secondClusterArray: the array of classes of type T -* @param size: the size of the data points of type uint64_t -* @param stream: the cudaStream object -*/ + * @brief Function to calculate RandIndex + * more info on rand index + * @param firstClusterArray: the array of classes of type T + * @param secondClusterArray: the array of classes of type T + * @param size: the size of the data points of type uint64_t + * @param stream: the cudaStream object + */ template -double compute_rand_index(T* firstClusterArray, T* secondClusterArray, - uint64_t size, cudaStream_t stream) { - //rand index for size less than 2 is not defined +double compute_rand_index(T* firstClusterArray, + T* secondClusterArray, + uint64_t size, + cudaStream_t stream) +{ + // rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - //allocating and initializing memory for a and b in the GPU + // allocating and initializing memory for a and b in the GPU rmm::device_uvector arr_buf(2, stream); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); - //kernel configuration + // kernel configuration static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 numBlocks(raft::ceildiv(size, numThreadsPerBlock.x), raft::ceildiv(size, numThreadsPerBlock.y)); - //calling the kernel - computeTheNumerator - <<>>( - firstClusterArray, secondClusterArray, size, arr_buf.data(), - arr_buf.data() + 1); + // calling the kernel + computeTheNumerator<<>>( + firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1); - //synchronizing and updating the calculated values of a and b from device to host + // synchronizing and updating the calculated values of a and b from device to host uint64_t ab_host[2] = {0}; raft::update_host(ab_host, arr_buf.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - //error handling + // error handling CUDA_CHECK(cudaGetLastError()); - //denominator + // denominator uint64_t nChooseTwo = size * (size - 1) / 2; - //calculating the rand_index + // calculating the rand_index return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo); } template -::std::ostream& operator<<(::std::ostream& os, - const LinkageInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const LinkageInputs& dims) +{ return os; } @@ -158,15 +158,17 @@ class LinkageTest : public ::testing::TestWithParam> { : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), labels(params.n_row, stream), - labels_ref(params.n_row, stream) {} + labels_ref(params.n_row, stream) + { + } protected: - void basicTest() { + void basicTest() + { rmm::device_uvector data(params.n_row * params.n_col, stream); raft::copy(data.data(), params.data.data(), data.size(), stream); - raft::copy(labels_ref.data(), params.expected_labels.data(), params.n_row, - stream); + raft::copy(labels_ref.data(), params.expected_labels.data(), params.n_row, stream); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels.data(); @@ -176,16 +178,19 @@ class LinkageTest : public ::testing::TestWithParam> { out_arrs.children = out_children.data(); raft::handle_t handle; - raft::hierarchy::single_linkage< - IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c, + raft::hierarchy::single_linkage( + handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + &out_arrs, + params.c, params.n_clusters); CUDA_CHECK(cudaStreamSynchronize(stream)); - score = compute_rand_index(labels.data(), labels_ref.data(), params.n_row, - stream); + score = compute_rand_index(labels.data(), labels_ref.data(), params.n_row, stream); } void SetUp() override { basicTest(); } @@ -203,14 +208,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10, @@ -218,8 +221,7 @@ const std::vector> linkage_inputsf2 = { // // Test outlier points {9, 2, - {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, - 10, 50, 30, 5}, + {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5}, {6, 0, 5, 0, 0, 4, 3, 2, 1}, 7, -1}, @@ -227,14 +229,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == (n_points / 2) {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {1, 0, 4, 0, 0, 3, 2, 0, 2, 1}, 5, @@ -243,340 +243,173 @@ const std::vector> linkage_inputsf2 = { // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -589,6 +422,5 @@ const std::vector> linkage_inputsf2 = { typedef LinkageTest LinkageTestF_Int; TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); } -INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, - ::testing::ValuesIn(linkage_inputsf2)); +INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2)); } // end namespace raft diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 4900b3ff2b..3cf465e032 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -39,24 +39,25 @@ struct CSRRowNormalizeInputs { }; template -class CSRRowNormalizeTest - : public ::testing::TestWithParam> { +class CSRRowNormalizeTest : public ::testing::TestWithParam> { public: CSRRowNormalizeTest() - : params(::testing::TestWithParam< - CSRRowNormalizeInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in_vals(params.in_vals.size(), stream), verify(params.verify.size(), stream), ex_scan(params.ex_scan.size(), stream), - result(params.verify.size(), stream) {} + result(params.verify.size(), stream) + { + } protected: void SetUp() override {} - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.in_vals.size(); + Index_ nnz = params.in_vals.size(); raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream); raft::update_device(in_vals.data(), params.in_vals.data(), nnz, stream); @@ -73,8 +74,8 @@ class CSRRowNormalizeTest break; } - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); } protected: @@ -113,9 +114,11 @@ const std::vector> csrnormalize_inputs_d = { {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestF, ::testing::ValuesIn(csrnormalize_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestD, ::testing::ValuesIn(csrnormalize_inputs_d)); } // namespace sparse diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 8ff4a600bc..9a27ae5134 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -42,15 +42,15 @@ struct SparseReduceInputs { }; template -class SparseReduceTest - : public ::testing::TestWithParam> { +class SparseReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - SparseReduceInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); } - void Run() { + void Run() + { raft::handle_t handle; auto stream = handle.get_stream(); @@ -62,30 +62,29 @@ class SparseReduceTest rmm::device_uvector out_cols(params.out_cols.size(), stream); rmm::device_uvector out_vals(params.out_vals.size(), stream); - raft::update_device(in_rows.data(), params.in_rows.data(), - params.in_rows.size(), stream); - raft::update_device(in_cols.data(), params.in_cols.data(), - params.in_cols.size(), stream); - raft::update_device(in_vals.data(), params.in_vals.data(), - params.in_vals.size(), stream); - raft::update_device(out_rows.data(), params.out_rows.data(), - params.out_rows.size(), stream); - raft::update_device(out_cols.data(), params.out_cols.data(), - params.out_cols.size(), stream); - raft::update_device(out_vals.data(), params.out_vals.data(), - params.out_vals.size(), stream); + raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream); + raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream); + raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream); + raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream); + raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream); + raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); raft::sparse::COO out(stream); - raft::sparse::op::max_duplicates(handle, out, in_rows.data(), - in_cols.data(), in_vals.data(), - params.in_rows.size(), params.m, params.n); + raft::sparse::op::max_duplicates(handle, + out, + in_rows.data(), + in_cols.data(), + in_vals.data(), + params.in_rows.size(), + params.m, + params.n); ASSERT_TRUE(raft::devArrMatch( out_rows.data(), out.rows(), out.nnz, raft::Compare())); ASSERT_TRUE(raft::devArrMatch( out_cols.data(), out.cols(), out.nnz, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, raft::Compare())); } void TearDown() override {} @@ -114,7 +113,8 @@ const std::vector> max_reduce_inputs_f = { 4}, }; -INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF, +INSTANTIATE_TEST_CASE_P(SparseReduceTest, + SparseReduceTestF, ::testing::ValuesIn(max_reduce_inputs_f)); } // namespace sparse diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index d527e7323e..d73288b9f6 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -38,43 +38,48 @@ struct CSRRowOpInputs { /** Wrapper to call csr_row_op because the enclosing function of a __device__ * lambda cannot have private ot protected access within the class. */ template -void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz, - Type_f *result, cudaStream_t stream) { +void csr_row_op_wrapper( + const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream) +{ op::csr_row_op( - row_ind, n_rows, nnz, + row_ind, + n_rows, + nnz, [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) { - for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row; + for (Index_ i = start_idx; i < stop_idx; i++) + result[i] = row; }, stream); } template -class CSRRowOpTest - : public ::testing::TestWithParam> { +class CSRRowOpTest : public ::testing::TestWithParam> { public: CSRRowOpTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), verify(params.verify.size(), stream), ex_scan(params.ex_scan.size(), stream), - result(params.verify.size(), stream) {} + result(params.verify.size(), stream) + { + } protected: - void SetUp() override { + void SetUp() override + { n_rows = params.ex_scan.size(); - nnz = params.verify.size(); + nnz = params.verify.size(); } - void Run() { + void Run() + { raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream); raft::update_device(verify.data(), params.verify.data(), nnz, stream); - csr_row_op_wrapper(ex_scan.data(), n_rows, nnz, - result.data(), stream); + csr_row_op_wrapper(ex_scan.data(), n_rows, nnz, result.data(), stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); } protected: @@ -100,10 +105,8 @@ const std::vector> csrrowop_inputs_d = { {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, - ::testing::ValuesIn(csrrowop_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, - ::testing::ValuesIn(csrrowop_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index 7d43780cfd..c7cd03b485 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -46,7 +46,8 @@ class SparseSortTest : public ::testing::TestWithParam> { const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseSortTest COOSort; -TEST_P(COOSort, Result) { +TEST_P(COOSort, Result) +{ params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -59,13 +60,13 @@ TEST_P(COOSort, Result) { r.uniform(in_vals.data(), params.nnz, float(-1.0), float(1.0), stream); - int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); - int *in_cols_h = (int *)malloc(params.nnz * sizeof(int)); - int *verify_h = (int *)malloc(params.nnz * sizeof(int)); + int* in_rows_h = (int*)malloc(params.nnz * sizeof(int)); + int* in_cols_h = (int*)malloc(params.nnz * sizeof(int)); + int* verify_h = (int*)malloc(params.nnz * sizeof(int)); for (int i = 0; i < params.nnz; i++) { in_rows_h[i] = params.nnz - i - 1; - verify_h[i] = i; + verify_h[i] = i; in_cols_h[i] = i; } @@ -74,11 +75,11 @@ TEST_P(COOSort, Result) { raft::update_device(in_cols.data(), in_cols_h, params.nnz, stream); raft::update_device(verify.data(), verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows.data(), in_cols.data(), - in_vals.data(), stream); + op::coo_sort( + params.m, params.n, params.nnz, in_rows.data(), in_cols.data(), in_vals.data(), stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), in_rows.data(), params.nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), in_rows.data(), params.nnz, raft::Compare())); delete[] in_rows_h; delete[] in_cols_h; diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index 77d9d3d822..53bea0ddc0 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -31,8 +31,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -51,28 +52,31 @@ struct SparseSymmetrizeInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSymmetrizeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSymmetrizeInputs& dims) +{ return os; } template -class SparseSymmetrizeTest : public ::testing::TestWithParam< - SparseSymmetrizeInputs> { +class SparseSymmetrizeTest + : public ::testing::TestWithParam> { public: SparseSymmetrizeTest() - : params(::testing::TestWithParam< - SparseSymmetrizeInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), indptr(0, stream), indices(0, stream), - data(0, stream) {} + data(0, stream) + { + } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; indptr.resize(indptr_h.size(), stream); indices.resize(indices_h.size(), stream); @@ -83,22 +87,22 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< update_device(data.data(), data_h.data(), data_h.size(), stream); } - void SetUp() override { + void SetUp() override + { make_data(); - value_idx m = params.indptr_h.size() - 1; - value_idx n = params.n_cols; + value_idx m = params.indptr_h.size() - 1; + value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); rmm::device_uvector coo_rows(nnz, stream); - raft::sparse::convert::csr_to_coo(indptr.data(), m, coo_rows.data(), nnz, - stream); + raft::sparse::convert::csr_to_coo(indptr.data(), m, coo_rows.data(), nnz, stream); raft::sparse::COO out(stream); - raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices.data(), - data.data(), m, n, coo_rows.size(), out); + raft::sparse::linalg::symmetrize( + handle, coo_rows.data(), indices.data(), data.data(), m, n, coo_rows.size(), out); rmm::device_scalar sum(stream); sum.set_value_to_zero_async(stream); @@ -130,8 +134,7 @@ struct COOSymmetrizeInputs { }; template -class COOSymmetrizeTest - : public ::testing::TestWithParam> { +class COOSymmetrizeTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -141,22 +144,21 @@ class COOSymmetrizeTest const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef COOSymmetrizeTest COOSymmetrize; -TEST_P(COOSymmetrize, Result) { +TEST_P(COOSymmetrize, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int nnz = 8; - int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; - float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; + int* in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; + float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; - int *exp_rows_h = - new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; - int *exp_cols_h = - new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; - float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, - 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; + int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; + int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; + float* exp_vals_h = + new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; COO in(stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); @@ -166,22 +168,18 @@ TEST_P(COOSymmetrize, Result) { COO out(stream); linalg::coo_symmetrize<32, float>( - &in, &out, - [] __device__(int row, int col, float val, float trans) { - return val + trans; - }, + &in, + &out, + [] __device__(int row, int col, float val, float trans) { return val + trans; }, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; ASSERT_TRUE(out.nnz == nnz * 2); - ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, raft::Compare())); cudaStreamDestroy(stream); @@ -194,8 +192,7 @@ TEST_P(COOSymmetrize, Result) { delete[] exp_vals_h; } -INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf)); const std::vector> symm_inputs_fint = { // Test n_clusters == n_points @@ -215,7 +212,8 @@ const std::vector> symm_inputs_fint = { typedef SparseSymmetrizeTest SparseSymmetrizeTestF_int; TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int, +INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, + SparseSymmetrizeTestF_int, ::testing::ValuesIn(symm_inputs_fint)); } // namespace sparse diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index ca30506df0..ab85e7fe8f 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -37,21 +37,26 @@ namespace knn { using namespace std; template -__global__ void count_discrepancies_kernel(value_idx *actual_idx, - value_idx *expected_idx, - value_t *actual, value_t *expected, - uint32_t m, uint32_t n, - uint32_t *out, float thres = 1e-3) { +__global__ void count_discrepancies_kernel(value_idx* actual_idx, + value_idx* expected_idx, + value_t* actual, + value_t* expected, + uint32_t m, + uint32_t n, + uint32_t* out, + float thres = 1e-3) +{ uint32_t row = blockDim.x * blockIdx.x + threadIdx.x; int n_diffs = 0; if (row < m) { for (uint32_t i = 0; i < n; i++) { - value_t d = actual[row * n + i] - expected[row * n + i]; + value_t d = actual[row * n + i] - expected[row * n + i]; bool matches = fabsf(d) <= thres; if (!matches) { // printf("row=%d, actual_idx=%ld, actual=%f, expected_id=%ld, expected=%f\n", - // row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i], expected[row*n+i]); + // row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i], + // expected[row*n+i]); } n_diffs += !matches; @@ -61,13 +66,19 @@ __global__ void count_discrepancies_kernel(value_idx *actual_idx, } struct is_nonzero { - __host__ __device__ bool operator()(uint32_t &i) { return i > 0; } + __host__ __device__ bool operator()(uint32_t& i) { return i > 0; } }; template -uint32_t count_discrepancies(value_idx *actual_idx, value_idx *expected_idx, - value_t *actual, value_t *expected, uint32_t m, - uint32_t n, uint32_t *out, cudaStream_t stream) { +uint32_t count_discrepancies(value_idx* actual_idx, + value_idx* expected_idx, + value_t* actual, + value_t* expected, + uint32_t m, + uint32_t n, + uint32_t* out, + cudaStream_t stream) +{ uint32_t tpb = 256; count_discrepancies_kernel<<>>( actual_idx, expected_idx, actual, expected, m, n, out); @@ -79,25 +90,41 @@ uint32_t count_discrepancies(value_idx *actual_idx, value_idx *expected_idx, } template -void compute_bfknn(const raft::handle_t &handle, const value_t *X1, - const value_t *X2, uint32_t n, uint32_t d, uint32_t k, - const raft::distance::DistanceType metric, value_t *dists, - int64_t *inds) { - std::vector input_vec = {const_cast(X1)}; +void compute_bfknn(const raft::handle_t& handle, + const value_t* X1, + const value_t* X2, + uint32_t n, + uint32_t d, + uint32_t k, + const raft::distance::DistanceType metric, + value_t* dists, + int64_t* inds) +{ + std::vector input_vec = {const_cast(X1)}; std::vector sizes_vec = {n}; - cudaStream_t *int_streams = nullptr; - std::vector *translations = nullptr; - - raft::spatial::knn::detail::brute_force_knn_impl( - input_vec, sizes_vec, d, const_cast(X2), n, inds, dists, k, - handle.get_stream(), int_streams, 0, true, true, translations, metric); + cudaStream_t* int_streams = nullptr; + std::vector* translations = nullptr; + + raft::spatial::knn::detail::brute_force_knn_impl(input_vec, + sizes_vec, + d, + const_cast(X2), + n, + inds, + dists, + k, + handle.get_stream(), + int_streams, + 0, + true, + true, + translations, + metric); } struct ToRadians { - __device__ __host__ float operator()(float a) { - return a * (CUDART_PI_F / 180.0); - } + __device__ __host__ float operator()(float a) { return a * (CUDART_PI_F / 180.0); } }; struct BallCoverInputs { @@ -109,13 +136,14 @@ struct BallCoverInputs { template class BallCoverKNNQueryTest : public ::testing::TestWithParam { protected: - void basicTest() { + void basicTest() + { params = ::testing::TestWithParam::GetParam(); raft::handle_t handle; - uint32_t k = params.k; + uint32_t k = params.k; float weight = params.weight; - auto metric = params.metric; + auto metric = params.metric; std::vector h_train_inputs = spatial_data; @@ -126,17 +154,25 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { // Allocate input rmm::device_uvector d_train_inputs(n * d, handle.get_stream()); - raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, - handle.get_stream()); + raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, handle.get_stream()); if (metric == raft::distance::DistanceType::Haversine) { - thrust::transform(handle.get_thrust_policy(), d_train_inputs.data(), + thrust::transform(handle.get_thrust_policy(), + d_train_inputs.data(), d_train_inputs.data() + d_train_inputs.size(), - d_train_inputs.data(), ToRadians()); + d_train_inputs.data(), + ToRadians()); } - compute_bfknn(handle, d_train_inputs.data(), d_train_inputs.data(), n, d, k, - metric, d_ref_D.data(), d_ref_I.data()); + compute_bfknn(handle, + d_train_inputs.data(), + d_train_inputs.data(), + n, + d, + k, + metric, + d_ref_D.data(), + d_ref_I.data()); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); @@ -144,13 +180,11 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { rmm::device_uvector d_pred_I(n * k, handle.get_stream()); rmm::device_uvector d_pred_D(n * k, handle.get_stream()); - BallCoverIndex index(handle, d_train_inputs.data(), n, - d, metric); + BallCoverIndex index(handle, d_train_inputs.data(), n, d, metric); raft::spatial::knn::rbc_build_index(handle, index); - raft::spatial::knn::rbc_knn_query(handle, index, k, d_train_inputs.data(), - n, d_pred_I.data(), d_pred_D.data(), true, - weight); + raft::spatial::knn::rbc_knn_query( + handle, index, k, d_train_inputs.data(), n, d_pred_I.data(), d_pred_D.data(), true, weight); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); // What we really want are for the distances to match exactly. The @@ -158,12 +192,19 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { // can be nondeterministic. rmm::device_uvector discrepancies(n, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), discrepancies.data(), - discrepancies.data() + discrepancies.size(), 0); + thrust::fill(handle.get_thrust_policy(), + discrepancies.data(), + discrepancies.data() + discrepancies.size(), + 0); // - int res = count_discrepancies(d_ref_I.data(), d_pred_I.data(), - d_ref_D.data(), d_pred_D.data(), n, k, - discrepancies.data(), handle.get_stream()); + int res = count_discrepancies(d_ref_I.data(), + d_pred_I.data(), + d_ref_D.data(), + d_pred_D.data(), + n, + k, + discrepancies.data(), + handle.get_stream()); ASSERT_TRUE(res == 0); } @@ -180,13 +221,14 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { template class BallCoverAllKNNTest : public ::testing::TestWithParam { protected: - void basicTest() { + void basicTest() + { params = ::testing::TestWithParam::GetParam(); raft::handle_t handle; - uint32_t k = params.k; + uint32_t k = params.k; float weight = params.weight; - auto metric = params.metric; + auto metric = params.metric; std::vector h_train_inputs = spatial_data; @@ -197,25 +239,37 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { // Allocate input rmm::device_uvector d_train_inputs(n * d, handle.get_stream()); - raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, - handle.get_stream()); + raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, handle.get_stream()); if (metric == raft::distance::DistanceType::Haversine) { - thrust::transform(handle.get_thrust_policy(), d_train_inputs.data(), + thrust::transform(handle.get_thrust_policy(), + d_train_inputs.data(), d_train_inputs.data() + d_train_inputs.size(), - d_train_inputs.data(), ToRadians()); + d_train_inputs.data(), + ToRadians()); } - cudaStream_t *int_streams = nullptr; - std::vector *translations = nullptr; + cudaStream_t* int_streams = nullptr; + std::vector* translations = nullptr; - std::vector input_vec = {d_train_inputs.data()}; + std::vector input_vec = {d_train_inputs.data()}; std::vector sizes_vec = {n}; - raft::spatial::knn::detail::brute_force_knn_impl( - input_vec, sizes_vec, d, d_train_inputs.data(), n, d_ref_I.data(), - d_ref_D.data(), k, handle.get_stream(), int_streams, 0, true, true, - translations, metric); + raft::spatial::knn::detail::brute_force_knn_impl(input_vec, + sizes_vec, + d, + d_train_inputs.data(), + n, + d_ref_I.data(), + d_ref_D.data(), + k, + handle.get_stream(), + int_streams, + 0, + true, + true, + translations, + metric); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); @@ -223,11 +277,10 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { rmm::device_uvector d_pred_I(n * k, handle.get_stream()); rmm::device_uvector d_pred_D(n * k, handle.get_stream()); - BallCoverIndex index(handle, d_train_inputs.data(), n, - d, metric); + BallCoverIndex index(handle, d_train_inputs.data(), n, d, metric); - raft::spatial::knn::rbc_all_knn_query(handle, index, k, d_pred_I.data(), - d_pred_D.data(), true, weight); + raft::spatial::knn::rbc_all_knn_query( + handle, index, k, d_pred_I.data(), d_pred_D.data(), true, weight); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); // What we really want are for the distances to match exactly. The @@ -235,12 +288,19 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { // can be nondeterministic. rmm::device_uvector discrepancies(n, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), discrepancies.data(), - discrepancies.data() + discrepancies.size(), 0); + thrust::fill(handle.get_thrust_policy(), + discrepancies.data(), + discrepancies.data() + discrepancies.size(), + 0); // - uint32_t res = count_discrepancies( - d_ref_I.data(), d_pred_I.data(), d_ref_D.data(), d_pred_D.data(), n, k, - discrepancies.data(), handle.get_stream()); + uint32_t res = count_discrepancies(d_ref_I.data(), + d_pred_I.data(), + d_ref_D.data(), + d_pred_D.data(), + n, + k, + discrepancies.data(), + handle.get_stream()); ASSERT_TRUE(res == 0); } @@ -265,9 +325,11 @@ const std::vector ballcover_inputs = { {7, 1.0, raft::distance::DistanceType::L2SqrtUnexpanded}, }; -INSTANTIATE_TEST_CASE_P(BallCoverAllKNNTest, BallCoverAllKNNTestF, +INSTANTIATE_TEST_CASE_P(BallCoverAllKNNTest, + BallCoverAllKNNTestF, ::testing::ValuesIn(ballcover_inputs)); -INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest, BallCoverKNNQueryTestF, +INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest, + BallCoverKNNQueryTestF, ::testing::ValuesIn(ballcover_inputs)); TEST_P(BallCoverAllKNNTestF, Fit) { basicTest(); } diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index 4930b47e0c..e48a3c6657 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -49,20 +49,25 @@ struct idx_dist_pair { IdxT idx; DistT dist; compareDist eq_compare; - bool operator==(const idx_dist_pair &a) const { + bool operator==(const idx_dist_pair& a) const + { if (idx == a.idx) return true; if (eq_compare(dist, a.dist)) return true; return false; } - idx_dist_pair(IdxT x, DistT y, compareDist op) - : idx(x), dist(y), eq_compare(op) {} + idx_dist_pair(IdxT x, DistT y, compareDist op) : idx(x), dist(y), eq_compare(op) {} }; template -testing::AssertionResult devArrMatchKnnPair( - const T *expected_idx, const T *actual_idx, const DistT *expected_dist, - const DistT *actual_dist, size_t rows, size_t cols, const DistT eps, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatchKnnPair(const T* expected_idx, + const T* actual_idx, + const DistT* expected_dist, + const DistT* actual_dist, + size_t rows, + size_t cols, + const DistT eps, + cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr exp_idx_h(new T[size]); std::unique_ptr act_idx_h(new T[size]); @@ -75,9 +80,9 @@ testing::AssertionResult devArrMatchKnnPair( CUDA_CHECK(cudaStreamSynchronize(stream)); for (size_t i(0); i < rows; ++i) { for (size_t j(0); j < cols; ++j) { - auto idx = i * cols + j; // row major assumption! - auto exp_idx = exp_idx_h.get()[idx]; - auto act_idx = act_idx_h.get()[idx]; + auto idx = i * cols + j; // row major assumption! + auto exp_idx = exp_idx_h.get()[idx]; + auto act_idx = act_idx_h.get()[idx]; auto exp_dist = exp_dist_h.get()[idx]; auto act_dist = act_dist_h.get()[idx]; idx_dist_pair exp_kvp(exp_idx, exp_dist, raft::CompareApprox(eps)); @@ -85,8 +90,7 @@ testing::AssertionResult devArrMatchKnnPair( if (!(exp_kvp == act_kvp)) { return testing::AssertionFailure() << "actual=" << act_kvp.idx << "," << act_kvp.dist << "!=" - << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i - << "," << j; + << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i << "," << j; } } } @@ -96,26 +100,43 @@ testing::AssertionResult devArrMatchKnnPair( template class FusedL2KNNTest : public ::testing::TestWithParam { protected: - void testBruteForce() { + void testBruteForce() + { cudaStream_t stream = handle_.get_stream(); launchFaissBfknn(); - detail::fusedL2Knn(dim, raft_indices_, raft_distances_, database, - search_queries, num_db_vecs, num_queries, k_, true, true, - stream, metric); + detail::fusedL2Knn(dim, + raft_indices_, + raft_distances_, + database, + search_queries, + num_db_vecs, + num_queries, + k_, + true, + true, + stream, + metric); // verify. - devArrMatchKnnPair(faiss_indices_, raft_indices_, faiss_distances_, - raft_distances_, num_queries, k_, float(0.001), stream); + devArrMatchKnnPair(faiss_indices_, + raft_indices_, + faiss_distances_, + raft_distances_, + num_queries, + k_, + float(0.001), + stream); } - void SetUp() override { - params_ = ::testing::TestWithParam::GetParam(); + void SetUp() override + { + params_ = ::testing::TestWithParam::GetParam(); num_queries = params_.num_queries; num_db_vecs = params_.num_db_vecs; - dim = params_.dim; - k_ = params_.k; - metric = params_.metric_; + dim = params_.dim; + k_ = params_.k; + metric = params_.metric_; cudaStream_t stream = handle_.get_stream(); @@ -133,12 +154,14 @@ class FusedL2KNNTest : public ::testing::TestWithParam { raft::allocate(faiss_distances_, num_queries * k_, stream, true); } - void TearDown() override { + void TearDown() override + { cudaStream_t stream = handle_.get_stream(); raft::deallocate_all(stream); } - void launchFaissBfknn() { + void launchFaissBfknn() + { faiss::MetricType m = detail::build_faiss_metric(metric); faiss::gpu::StandardGpuResources gpu_res; @@ -149,18 +172,18 @@ class FusedL2KNNTest : public ::testing::TestWithParam { gpu_res.setDefaultStream(device, handle_.get_stream()); faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = 0; - args.k = k_; - args.dims = dim; - args.vectors = database; + args.metric = m; + args.metricArg = 0; + args.k = k_; + args.dims = dim; + args.vectors = database; args.vectorsRowMajor = true; - args.numVectors = num_db_vecs; - args.queries = search_queries; + args.numVectors = num_db_vecs; + args.queries = search_queries; args.queriesRowMajor = true; - args.numQueries = num_queries; - args.outDistances = faiss_distances_; - args.outIndices = faiss_indices_; + args.numQueries = num_queries; + args.outDistances = faiss_distances_; + args.outIndices = faiss_indices_; bfKnn(&gpu_res, args); } @@ -171,12 +194,12 @@ class FusedL2KNNTest : public ::testing::TestWithParam { int num_queries; int num_db_vecs; int dim; - T *database; - T *search_queries; - int64_t *raft_indices_; - T *raft_distances_; - int64_t *faiss_indices_; - T *faiss_distances_; + T* database; + T* search_queries; + int64_t* raft_indices_; + T* raft_distances_; + int64_t* faiss_indices_; + T* faiss_distances_; int k_; raft::distance::DistanceType metric; }; @@ -201,8 +224,7 @@ const std::vector inputs = { typedef FusedL2KNNTest FusedL2KNNTestF; TEST_P(FusedL2KNNTestF, FusedBruteForce) { this->testBruteForce(); } -INSTANTIATE_TEST_CASE_P(FusedL2KNNTest, FusedL2KNNTestF, - ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(FusedL2KNNTest, FusedL2KNNTestF, ::testing::ValuesIn(inputs)); } // namespace knn } // namespace spatial diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index 5a45c45bff..bff7665f83 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -35,10 +35,13 @@ class HaversineKNNTest : public ::testing::Test { d_ref_I(0, stream), d_ref_D(0, stream), d_pred_I(0, stream), - d_pred_D(0, stream) {} + d_pred_D(0, stream) + { + } protected: - void basicTest() { + void basicTest() + { // Allocate input d_train_inputs.resize(n * d, stream); @@ -51,35 +54,45 @@ class HaversineKNNTest : public ::testing::Test { d_pred_D.resize(n * n, stream); // make testdata on host - std::vector h_train_inputs = { - 0.71113885, -1.29215058, 0.59613176, -2.08048115, - 0.74932804, -1.33634042, 0.51486728, -1.65962873, - 0.53154002, -1.47049808, 0.72891737, -1.54095137}; + std::vector h_train_inputs = {0.71113885, + -1.29215058, + 0.59613176, + -2.08048115, + 0.74932804, + -1.33634042, + 0.51486728, + -1.65962873, + 0.53154002, + -1.47049808, + 0.72891737, + -1.54095137}; h_train_inputs.resize(d_train_inputs.size()); - raft::update_device(d_train_inputs.data(), h_train_inputs.data(), - d_train_inputs.size(), stream); - - std::vector h_res_D = { - 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, - 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, - 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, - 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, - 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, - 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; + raft::update_device( + d_train_inputs.data(), h_train_inputs.data(), d_train_inputs.size(), stream); + + std::vector h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, + 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, + 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, + 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, + 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, + 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); raft::update_device(d_ref_D.data(), h_res_D.data(), n * n, stream); - std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, - 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, - 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; + std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, + 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); - raft::update_device(d_ref_I.data(), h_res_I.data(), n * n, - stream); + raft::update_device(d_ref_I.data(), h_res_I.data(), n * n, stream); - raft::spatial::knn::detail::haversine_knn( - d_pred_I.data(), d_pred_D.data(), d_train_inputs.data(), - d_train_inputs.data(), n, n, k, stream); + raft::spatial::knn::detail::haversine_knn(d_pred_I.data(), + d_pred_D.data(), + d_train_inputs.data(), + d_train_inputs.data(), + n, + n, + k, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -106,11 +119,11 @@ class HaversineKNNTest : public ::testing::Test { typedef HaversineKNNTest HaversineKNNTestF; -TEST_F(HaversineKNNTestF, Fit) { - ASSERT_TRUE(raft::devArrMatch(d_ref_D.data(), d_pred_D.data(), n * n, - raft::CompareApprox(1e-3))); - ASSERT_TRUE(raft::devArrMatch(d_ref_I.data(), d_pred_I.data(), n * n, - raft::Compare())); +TEST_F(HaversineKNNTestF, Fit) +{ + ASSERT_TRUE( + raft::devArrMatch(d_ref_D.data(), d_pred_D.data(), n * n, raft::CompareApprox(1e-3))); + ASSERT_TRUE(raft::devArrMatch(d_ref_I.data(), d_pred_I.data(), n * n, raft::Compare())); } } // namespace knn diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 35a82b1e53..49e5aaab4b 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -36,17 +36,17 @@ struct KNNInputs { std::vector labels; }; -__global__ void build_actual_output(int *output, int n_rows, int k, - const int *idx_labels, - const int64_t *indices) { +__global__ void build_actual_output( + int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices) +{ int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= n_rows * k) return; output[element] = idx_labels[indices[element]]; } -__global__ void build_expected_output(int *output, int n_rows, int k, - const int *labels) { +__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels) +{ int row = threadIdx.x + blockDim.x * blockIdx.x; if (row >= n_rows) return; @@ -68,23 +68,33 @@ class KNNTest : public ::testing::TestWithParam { search_data_(0, stream), indices_(0, stream), distances_(0, stream), - search_labels_(0, stream) {} + search_labels_(0, stream) + { + } protected: - void testBruteForce() { - raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, - std::cout); + void testBruteForce() + { + raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, std::cout); std::cout << "K: " << k_ << "\n"; - raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, - std::cout); + raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, std::cout); - std::vector input_vec; + std::vector input_vec; std::vector sizes_vec; input_vec.push_back(input_.data()); sizes_vec.push_back(rows_); - brute_force_knn(handle, input_vec, sizes_vec, cols_, search_data_.data(), - rows_, indices_.data(), distances_.data(), k_, true, true); + brute_force_knn(handle, + input_vec, + sizes_vec, + cols_, + search_data_.data(), + rows_, + indices_.data(), + distances_.data(), + k_, + true, + true); build_actual_output<<>>( actual_labels_.data(), rows_, k_, search_labels_.data(), indices_.data()); @@ -92,14 +102,15 @@ class KNNTest : public ::testing::TestWithParam { build_expected_output<<>>( expected_labels_.data(), rows_, k_, search_labels_.data()); - ASSERT_TRUE(devArrMatch(expected_labels_.data(), actual_labels_.data(), - rows_ * k_, raft::Compare())); + ASSERT_TRUE(devArrMatch( + expected_labels_.data(), actual_labels_.data(), rows_ * k_, raft::Compare())); } - void SetUp() override { + void SetUp() override + { rows_ = params_.input.size(); cols_ = params_.input[0].size(); - k_ = params_.k; + k_ = params_.k; actual_labels_.resize(rows_ * k_, stream); expected_labels_.resize(rows_ * k_, stream); @@ -109,20 +120,17 @@ class KNNTest : public ::testing::TestWithParam { distances_.resize(rows_ * k_, stream); search_labels_.resize(rows_, stream); - CUDA_CHECK(cudaMemsetAsync(actual_labels_.data(), 0, - actual_labels_.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(expected_labels_.data(), 0, - expected_labels_.size() * sizeof(int), stream)); CUDA_CHECK( - cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream)); - CUDA_CHECK(cudaMemsetAsync(search_data_.data(), 0, - search_data_.size() * sizeof(float), stream)); - CUDA_CHECK(cudaMemsetAsync(indices_.data(), 0, - indices_.size() * sizeof(int64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(distances_.data(), 0, - distances_.size() * sizeof(float), stream)); - CUDA_CHECK(cudaMemsetAsync(search_labels_.data(), 0, - search_labels_.size() * sizeof(int), stream)); + cudaMemsetAsync(actual_labels_.data(), 0, actual_labels_.size() * sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(expected_labels_.data(), 0, expected_labels_.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream)); + CUDA_CHECK( + cudaMemsetAsync(search_data_.data(), 0, search_data_.size() * sizeof(float), stream)); + CUDA_CHECK(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(distances_.data(), 0, distances_.size() * sizeof(float), stream)); + CUDA_CHECK( + cudaMemsetAsync(search_labels_.data(), 0, search_labels_.size() * sizeof(int), stream)); std::vector row_major_input; for (std::size_t i = 0; i < params_.input.size(); ++i) { @@ -130,13 +138,13 @@ class KNNTest : public ::testing::TestWithParam { row_major_input.push_back(params_.input[i][j]); } } - rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float), stream); - float *input_ptr = static_cast(input_d.data()); + rmm::device_buffer input_d = + rmm::device_buffer(row_major_input.data(), row_major_input.size() * sizeof(float), stream); + float* input_ptr = static_cast(input_d.data()); - rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int), stream); - int *labels_ptr = static_cast(labels_d.data()); + rmm::device_buffer labels_d = + rmm::device_buffer(params_.labels.data(), params_.labels.size() * sizeof(int), stream); + int* labels_ptr = static_cast(labels_d.data()); raft::copy(input_.data(), input_ptr, rows_ * cols_, stream); raft::copy(search_data_.data(), input_ptr, rows_ * cols_, stream); diff --git a/cpp/test/spatial/selection.cu b/cpp/test/spatial/selection.cu index 7742b9bd30..ad6d1e58d1 100644 --- a/cpp/test/spatial/selection.cu +++ b/cpp/test/spatial/selection.cu @@ -45,8 +45,9 @@ struct SparseSelectionInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSelectionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSelectionInputs& dims) +{ return os; } @@ -55,18 +56,20 @@ class SparseSelectionTest : public ::testing::TestWithParam> { public: SparseSelectionTest() - : params(::testing::TestWithParam< - SparseSelectionInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), dists(0, stream), inds(0, stream), out_indices_ref(0, stream), out_dists_ref(0, stream), out_dists(0, stream), - out_indices(0, stream) {} + out_indices(0, stream) + { + } protected: - void make_data() { + void make_data() + { std::vector dists_h = params.dists_h; dists.resize(n_rows * n_cols, stream); @@ -77,36 +80,43 @@ class SparseSelectionTest update_device(dists.data(), dists_h.data(), dists_h.size(), stream); iota_fill(inds.data(), n_rows, n_cols, stream); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; out_indices_ref.resize(out_indices_ref_h.size(), stream); out_dists_ref.resize(out_dists_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); } - void SetUp() override { + void SetUp() override + { n_rows = params.n_rows; n_cols = params.n_cols; - k = params.k; + k = params.k; make_data(); - raft::spatial::knn::select_k(dists.data(), inds.data(), n_rows, n_cols, - out_dists.data(), out_indices.data(), - params.select_min, k, stream); + raft::spatial::knn::select_k(dists.data(), + inds.data(), + n_rows, + n_cols, + out_dists.data(), + out_indices.data(), + params.select_min, + k, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k, - Compare())); - ASSERT_TRUE(devArrMatch(out_indices_ref.data(), out_indices.data(), - n_rows * k, Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k, Compare())); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); } protected: @@ -141,7 +151,8 @@ const std::vector> inputs_i32_f = { true}}; typedef SparseSelectionTest SparseSelectionTestF; TEST_P(SparseSelectionTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF, +INSTANTIATE_TEST_CASE_P(SparseSelectionTest, + SparseSelectionTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection diff --git a/cpp/test/spatial/spatial_data.h b/cpp/test/spatial/spatial_data.h index 87891164fc..dbb32c4546 100644 --- a/cpp/test/spatial/spatial_data.h +++ b/cpp/test/spatial/spatial_data.h @@ -5,23 +5,18 @@ namespace spatial { // Latitude and longitude coordinates of 51 US states / territories std::vector spatial_data = { - 63.588753, -154.493062, 32.318231, -86.902298, 35.20105, -91.831833, - 34.048928, -111.093731, 36.778261, -119.417932, 39.550051, -105.782067, - 41.603221, -73.087749, 38.905985, -77.033418, 38.910832, -75.52767, - 27.664827, -81.515754, 32.157435, -82.907123, 19.898682, -155.665857, - 41.878003, -93.097702, 44.068202, -114.742041, 40.633125, -89.398528, - 40.551217, -85.602364, 39.011902, -98.484246, 37.839333, -84.270018, - 31.244823, -92.145024, 42.407211, -71.382437, 39.045755, -76.641271, - 45.253783, -69.445469, 44.314844, -85.602364, 46.729553, -94.6859, - 37.964253, -91.831833, 32.354668, -89.398528, 46.879682, -110.362566, - 35.759573, -79.0193, 47.551493, -101.002012, 41.492537, -99.901813, - 43.193852, -71.572395, 40.058324, -74.405661, 34.97273, -105.032363, - 38.80261, -116.419389, 43.299428, -74.217933, 40.417287, -82.907123, - 35.007752, -97.092877, 43.804133, -120.554201, 41.203322, -77.194525, - 18.220833, -66.590149, 41.580095, -71.477429, 33.836081, -81.163725, - 43.969515, -99.901813, 35.517491, -86.580447, 31.968599, -99.901813, - 39.32098, -111.093731, 37.431573, -78.656894, 44.558803, -72.577841, - 47.751074, -120.740139, 43.78444, -88.787868, 38.597626, -80.454903, - 43.075968, -107.290284}; + 63.588753, -154.493062, 32.318231, -86.902298, 35.20105, -91.831833, 34.048928, -111.093731, + 36.778261, -119.417932, 39.550051, -105.782067, 41.603221, -73.087749, 38.905985, -77.033418, + 38.910832, -75.52767, 27.664827, -81.515754, 32.157435, -82.907123, 19.898682, -155.665857, + 41.878003, -93.097702, 44.068202, -114.742041, 40.633125, -89.398528, 40.551217, -85.602364, + 39.011902, -98.484246, 37.839333, -84.270018, 31.244823, -92.145024, 42.407211, -71.382437, + 39.045755, -76.641271, 45.253783, -69.445469, 44.314844, -85.602364, 46.729553, -94.6859, + 37.964253, -91.831833, 32.354668, -89.398528, 46.879682, -110.362566, 35.759573, -79.0193, + 47.551493, -101.002012, 41.492537, -99.901813, 43.193852, -71.572395, 40.058324, -74.405661, + 34.97273, -105.032363, 38.80261, -116.419389, 43.299428, -74.217933, 40.417287, -82.907123, + 35.007752, -97.092877, 43.804133, -120.554201, 41.203322, -77.194525, 18.220833, -66.590149, + 41.580095, -71.477429, 33.836081, -81.163725, 43.969515, -99.901813, 35.517491, -86.580447, + 31.968599, -99.901813, 39.32098, -111.093731, 37.431573, -78.656894, 44.558803, -72.577841, + 47.751074, -120.740139, 43.78444, -88.787868, 38.597626, -80.454903, 43.075968, -107.290284}; }; // namespace spatial }; // namespace raft \ No newline at end of file diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index 388ad56f2d..fa54b04cda 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -32,7 +32,8 @@ struct csr_view_t { index_type number_of_edges; }; } // namespace -TEST(Raft, SpectralMatrices) { +TEST(Raft, SpectralMatrices) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -48,7 +49,7 @@ TEST(Raft, SpectralMatrices) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; sparse_matrix_t sm2{h, csr_v}; @@ -62,9 +63,7 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args - auto cnstr_lm2 = [&h, &sm2](void) { - laplacian_matrix_t lm2{h, sm2}; - }; + auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t lm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { @@ -72,9 +71,7 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, &sm2](void) { - modularity_matrix_t mm2{h, sm2}; - }; + auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t mm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index cf866a5663..b8ea2cb799 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,7 +35,8 @@ struct MeanInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MeanInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) +{ return os; } @@ -48,20 +49,23 @@ class MeanTest : public ::testing::TestWithParam> { rows(params.rows), cols(params.cols), data(rows * cols, stream), - mean_act(rows * cols, stream) {} + mean_act(rows * cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = rows * cols; r.normal(data.data(), len, params.mean, (T)1.0, stream); meanSGtest(data.data(), stream); } - void meanSGtest(T *data, cudaStream_t stream) { + void meanSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; - mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, - stream); + mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, stream); } protected: @@ -76,52 +80,52 @@ class MeanTest : public ::testing::TestWithParam> { // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) -const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; - -const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, - {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, - {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, - {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, - {0.15, -1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, - {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, - {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, - {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, - {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; +const std::vector> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; + +const std::vector> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; typedef MeanTest MeanTestF; -TEST_P(MeanTestF, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(params.mean, mean_act.data(), params.cols, CompareApprox(params.tolerance))); } typedef MeanTest MeanTestD; -TEST_P(MeanTestD, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + params.mean, mean_act.data(), params.cols, CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index dcc4b4e551..6a76a289d7 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -34,37 +34,49 @@ struct MeanCenterInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MeanCenterInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs& dims) +{ return os; } template -class MeanCenterTest - : public ::testing::TestWithParam> { +class MeanCenterTest : public ::testing::TestWithParam> { public: MeanCenterTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), rows(params.rows), cols(params.cols), out(rows * cols, stream), out_ref(rows * cols, stream), data(rows * cols, stream), - meanVec(params.bcastAlongRows ? cols : rows, stream) {} + meanVec(params.bcastAlongRows ? cols : rows, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); auto len = rows * cols; r.normal(data.data(), len, params.mean, (T)1.0, stream); - raft::stats::mean(meanVec.data(), data.data(), cols, rows, params.sample, - params.rowMajor, stream); - meanCenter(out.data(), data.data(), meanVec.data(), cols, rows, - params.rowMajor, params.bcastAlongRows, stream); - raft::linalg::naiveMatVec(out_ref.data(), data.data(), meanVec.data(), cols, - rows, params.rowMajor, params.bcastAlongRows, + raft::stats::mean( + meanVec.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + meanCenter(out.data(), + data.data(), + meanVec.data(), + cols, + rows, + params.rowMajor, + params.bcastAlongRows, + stream); + raft::linalg::naiveMatVec(out_ref.data(), + data.data(), + meanVec.data(), + cols, + rows, + params.rowMajor, + params.bcastAlongRows, (T)-1.0); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -104,12 +116,12 @@ const std::vector> inputsf_i32 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i32; -TEST_P(MeanCenterTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, @@ -137,12 +149,12 @@ const std::vector> inputsf_i64 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i64; -TEST_P(MeanCenterTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -170,12 +182,12 @@ const std::vector> inputsd_i32 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i32; -TEST_P(MeanCenterTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -203,12 +215,12 @@ const std::vector> inputsd_i64 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i64; -TEST_P(MeanCenterTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 53f392aaf3..3efc54264e 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -34,7 +34,8 @@ struct StdDevInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const StdDevInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const StdDevInputs& dims) +{ return os; } @@ -49,10 +50,13 @@ class StdDevTest : public ::testing::TestWithParam> { data(rows * cols, stream), mean_act(cols, stream), stddev_act(cols, stream), - vars_act(cols, stream) {} + vars_act(cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { random::Rng r(params.seed); int len = rows * cols; @@ -65,17 +69,17 @@ class StdDevTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void stdVarSGtest(T *data, cudaStream_t stream) { + void stdVarSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; - mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, - stream); + mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, stream); - stddev(stddev_act.data(), data, mean_act.data(), cols, rows, params.sample, - params.rowMajor, stream); + stddev( + stddev_act.data(), data, mean_act.data(), cols, rows, params.sample, params.rowMajor, stream); - vars(vars_act.data(), data, mean_act.data(), cols, rows, params.sample, - params.rowMajor, stream); + vars( + vars_act.data(), data, mean_act.data(), cols, rows, params.sample, params.rowMajor, stream); raft::matrix::seqRoot(vars_act.data(), T(1), cols, stream); } @@ -126,28 +130,28 @@ const std::vector> inputsd = { {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}}; typedef StdDevTest StdDevTestF; -TEST_P(StdDevTestF, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + params.stddev, stddev_act.data(), params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act.data(), vars_act.data(), params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + stddev_act.data(), vars_act.data(), params.cols, CompareApprox(params.tolerance))); } typedef StdDevTest StdDevTestD; -TEST_P(StdDevTestD, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + params.stddev, stddev_act.data(), params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act.data(), vars_act.data(), params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + stddev_act.data(), vars_act.data(), params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index ac4d642c8e..ecb1171ea5 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -32,7 +32,8 @@ struct SumInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SumInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) +{ return os; } @@ -45,10 +46,13 @@ class SumTest : public ::testing::TestWithParam> { rows(params.rows), cols(params.cols), data(rows * cols, stream), - sum_act(cols, stream) {} + sum_act(cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { int len = rows * cols; T data_h[len]; @@ -77,14 +81,17 @@ const std::vector> inputsd = {{0.05, 1024, 32, 1234ULL}, {0.05, 1024, 256, 1234ULL}}; typedef SumTest SumTestF; -TEST_P(SumTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(SumTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + float(params.rows), sum_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } typedef SumTest SumTestD; -TEST_P(SumTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act.data(), +TEST_P(SumTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(double(params.rows), + sum_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index 0f135c0121..58b9ae42ae 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -32,15 +32,16 @@ namespace raft { template struct Compare { - bool operator()(const T &a, const T &b) const { return a == b; } + bool operator()(const T& a, const T& b) const { return a == b; } }; template struct CompareApprox { CompareApprox(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(a - b); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(a - b); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); @@ -53,9 +54,10 @@ struct CompareApprox { template struct CompareApproxAbs { CompareApproxAbs(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(abs(a) - abs(b)); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(abs(a) - abs(b)); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); } @@ -65,25 +67,26 @@ struct CompareApproxAbs { }; template -T abs(const T &a) { +T abs(const T& a) +{ return a > T(0) ? a : -a; } /* - * @brief Helper function to compare 2 device n-D arrays with custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value(s) - * @param actual actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - * @{ - */ + * @brief Helper function to compare 2 device n-D arrays with custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value(s) + * @param actual actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + * @{ + */ template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); raft::update_host(exp_h.get(), expected, size, stream); @@ -93,16 +96,16 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto exp = exp_h.get()[i]; auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { - return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i; + return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i; } } return testing::AssertionSuccess(); } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, - L eq_compare, cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -117,9 +120,13 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, } template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t rows, size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch(const T* expected, + const T* actual, + size_t rows, + size_t cols, + L eq_compare, + cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); @@ -133,8 +140,7 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto act = act_h.get()[idx]; if (!eq_compare(exp, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i << "," - << j; + << "actual=" << act << " != expected=" << exp << " @" << i << "," << j; } } } @@ -142,9 +148,9 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -155,8 +161,7 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -164,24 +169,24 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, } /* - * @brief Helper function to compare a device n-D arrays with an expected array - * on the host, using a custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected_h host array of expected value(s) - * @param actual_d device array actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare a device n-D arrays with an expected array + * on the host, using a custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected_h host array of expected value(s) + * @param actual_d device array actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatchHost( + const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual_d, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - bool ok = true; + bool ok = true; auto fail = testing::AssertionFailure(); for (size_t i(0); i < size; ++i) { auto exp = expected_h[i]; @@ -196,19 +201,19 @@ testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, } /* - * @brief Helper function to compare diagonal values of a 2D matrix - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value along diagonal - * @param actual actual matrix - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare diagonal values of a 2D matrix + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value along diagonal + * @param actual actual matrix + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult diagonalMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -220,8 +225,7 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -229,10 +233,10 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, } template -testing::AssertionResult match(const T expected, T actual, L eq_compare) { +testing::AssertionResult match(const T expected, T actual, L eq_compare) +{ if (!eq_compare(expected, actual)) { - return testing::AssertionFailure() - << "actual=" << actual << " != expected=" << expected; + return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected; } return testing::AssertionSuccess(); } @@ -256,8 +260,8 @@ testing::AssertionResult match(const T expected, T actual, L eq_compare) { ms /= args.runs; \ } while (0) -inline std::vector read_csv(std::string filename, - bool skip_first_n_columns = 1) { +inline std::vector read_csv(std::string filename, bool skip_first_n_columns = 1) +{ std::vector result; std::ifstream myFile(filename); if (!myFile.is_open()) throw std::runtime_error("Could not open file"); @@ -268,8 +272,7 @@ inline std::vector read_csv(std::string filename, if (myFile.good()) { std::getline(myFile, line); std::stringstream ss(line); - while (std::getline(ss, colname, ',')) { - } + while (std::getline(ss, colname, ',')) {} } int n_lines = 0;