diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f67853597..241d214f49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features - Initial RAFT version +- PR #3: defining raft::handle_t, device_buffer, host_buffer, allocator classes ## Improvements diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 160c3a2c69..cc5bb08907 100644 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -1,7 +1,7 @@ #!/bin/bash # Copyright (c) 2020, NVIDIA CORPORATION. ##################### -# cuML Style Tester # +# RAFT Style Tester # ##################### # Ignore errors and set path @@ -41,15 +41,9 @@ else fi # Check for a consistent #include syntax -# TODO: keep adding more dirs as and when we update the syntax HASH_INCLUDE=`python cpp/scripts/include_checker.py \ - cpp/bench \ - cpp/comms/mpi/include \ - cpp/comms/mpi/src \ - cpp/comms/std/include \ - cpp/comms/std/src \ cpp/include \ - cpp/examples \ + cpp/test \ 2>&1` HASH_RETVAL=$? if [ "$RETVAL" = "0" ]; then @@ -66,7 +60,6 @@ else fi # Check for a consistent code format -# TODO: keep adding more dirs when we add more source folders in cuml FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` FORMAT_RETVAL=$? if [ "$RETVAL" = "0" ]; then diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 62a511eaf0..6c3fc0dc7b 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -108,9 +108,7 @@ nvidia-smi logger "GoogleTest for raft..." cd $WORKSPACE/cpp/build -# Googletests haven't been moved over/integrated yet -# GTEST_OUTPUT="xml:${WORKSPACE}/test-results/raft_cpp/" ./test_raft -# running simple tests meanwhile +GTEST_OUTPUT="xml:${WORKSPACE}/test-results/raft_cpp/" ./test_raft logger "Python pytest for cuml..." cd $WORKSPACE/python diff --git a/cpp/.clang-format b/cpp/.clang-format new file mode 100644 index 0000000000..779ca0033a --- /dev/null +++ b/cpp/.clang-format @@ -0,0 +1,157 @@ +--- +# Refer to the following link for the explanation of each params: +# http://releases.llvm.org/8.0.1/tools/clang/docs/ClangFormatStyleOptions.html +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +# This is deprecated +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + # disabling the below splits, else, they'll just add to the vertical length of source files! + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: true +DerivePointerAlignment: true +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^' + Priority: 2 + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +# Enabling comment reflow causes doxygen comments to be messed up in their formats! +ReflowComments: false +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +# We are C++14, but clang-format puts this under `Cpp11` itself +Standard: Cpp11 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +# Be consistent with indent-width, even for people who use tab for indentation! +TabWidth: 2 +UseTab: Never +... diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy new file mode 100644 index 0000000000..30d96069b0 --- /dev/null +++ b/cpp/.clang-tidy @@ -0,0 +1,230 @@ +--- +# Refer to the following link for the explanation of each params: +# https://releases.llvm.org/8.0.1/tools/clang/tools/extra/docs/clang-tidy/checks/list.html +Checks: 'clang-diagnostic-*,clang-analyzer-*,modernize-*,-modernize-make-*,-modernize-raw-string-literal,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming,-*,modernize-*,-modernize-make-*,-modernize-raw-string-literal,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming' +WarningsAsErrors: '' +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +FormatStyle: none +User: snanditale +CheckOptions: + - key: google-build-namespaces.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: google-global-names-in-headers.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: google-readability-braces-around-statements.ShortStatementLines + value: '1' + - key: google-readability-function-size.BranchThreshold + value: '4294967295' + - key: google-readability-function-size.LineThreshold + value: '4294967295' + - key: google-readability-function-size.NestingThreshold + value: '4294967295' + - key: google-readability-function-size.ParameterThreshold + value: '4294967295' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-function-size.VariableThreshold + value: '4294967295' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' + - key: google-runtime-int.SignedTypePrefix + value: int + - key: google-runtime-int.TypeSuffix + value: '' + - key: google-runtime-int.UnsignedTypePrefix + value: uint + - key: google-runtime-references.WhiteListTypes + value: '' + - key: modernize-loop-convert.MaxCopySize + value: '16' + - key: modernize-loop-convert.MinConfidence + value: reasonable + - key: modernize-loop-convert.NamingStyle + value: CamelCase + - key: modernize-pass-by-value.IncludeStyle + value: llvm + - key: modernize-pass-by-value.ValuesOnly + value: '0' + - key: modernize-replace-auto-ptr.IncludeStyle + value: llvm + - key: modernize-replace-random-shuffle.IncludeStyle + value: llvm + - key: modernize-use-auto.MinTypeNameLength + value: '5' + - key: modernize-use-auto.RemoveStars + value: '0' + - key: modernize-use-default-member-init.IgnoreMacros + value: '1' + - key: modernize-use-default-member-init.UseAssignment + value: '0' + - key: modernize-use-emplace.ContainersWithPushBack + value: '::std::vector;::std::list;::std::deque' + - key: modernize-use-emplace.SmartPointers + value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr' + - key: modernize-use-emplace.TupleMakeFunctions + value: '::std::make_pair;::std::make_tuple' + - key: modernize-use-emplace.TupleTypes + value: '::std::pair;::std::tuple' + - key: modernize-use-equals-default.IgnoreMacros + value: '1' + - key: modernize-use-noexcept.ReplacementString + value: '' + - key: modernize-use-noexcept.UseNoexceptFalse + value: '1' + - key: modernize-use-nullptr.NullMacros + value: 'NULL' + - key: modernize-use-transparent-functors.SafeMode + value: '0' + - key: modernize-use-using.IgnoreMacros + value: '1' + - key: readability-identifier-naming.AbstractClassCase + value: lower_case + - key: readability-identifier-naming.AbstractClassPrefix + value: '' + - key: readability-identifier-naming.AbstractClassSuffix + value: '' + - key: readability-identifier-naming.ClassCase + value: lower_case + - key: readability-identifier-naming.ClassPrefix + value: '' + - key: readability-identifier-naming.ClassSuffix + value: '' + - key: readability-identifier-naming.ClassConstantCase + value: CamelCase + - key: readability-identifier-naming.ClassConstantPrefix + value: 'k' + - key: readability-identifier-naming.ClassConstantSuffix + value: '' + - key: readability-identifier-naming.ClassMemberCase + value: lower_case + - key: readability-identifier-naming.ClassMemberPrefix + value: '' + - key: readability-identifier-naming.ClassMemberSuffix + value: '_' + - key: readability-identifier-naming.ClassMethodCase + value: lower_case + - key: readability-identifier-naming.ClassMethodPrefix + value: '' + - key: readability-identifier-naming.ClassMethodSuffix + value: '' + - key: readability-identifier-naming.ConstexprFunctionCase + value: lower_case + - key: readability-identifier-naming.ConstexprFunctionPrefix + value: '' + - key: readability-identifier-naming.ConstexprFunctionSuffix + value: '' + - key: readability-identifier-naming.ConstexprMethodCase + value: lower_case + - key: readability-identifier-naming.ConstexprMethodPrefix + value: '' + - key: readability-identifier-naming.ConstexprMethodSuffix + value: '' + - key: readability-identifier-naming.ConstexprVariableCase + value: CamelCase + - key: readability-identifier-naming.ConstexprVariablePrefix + value: 'k' + - key: readability-identifier-naming.ConstexprVariableSuffix + value: '' + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.EnumPrefix + value: '' + - key: readability-identifier-naming.EnumSuffix + value: '' + - key: readability-identifier-naming.EnumConstantCase + value: CamelCase + - key: readability-identifier-naming.EnumConstantPrefix + value: 'k' + - key: readability-identifier-naming.EnumConstantSuffix + value: '' + - key: readability-identifier-naming.FunctionCase + value: lower_case + - key: readability-identifier-naming.FunctionPrefix + value: '' + - key: readability-identifier-naming.FunctionSuffix + value: '' + - key: readability-identifier-naming.GlobalConstantCase + value: CamelCase + - key: readability-identifier-naming.GlobalConstantPrefix + value: 'k' + - key: readability-identifier-naming.GlobalConstantSuffix + value: '' + - key: readability-identifier-naming.IgnoreFailedSplit + value: '0' + - key: readability-identifier-naming.LocalVariableCase + value: 'lower_case' + - key: readability-identifier-naming.LocalVariablePrefix + value: '' + - key: readability-identifier-naming.LocalVariableSuffix + value: '' + - key: readability-identifier-naming.ConstExprVariableCase + value: 'CamelCase' + - key: readability-identifier-naming.ConstExprVariablePrefix + value: 'k' + - key: readability-identifier-naming.ConstExprVariableSuffix + value: '' + - key: readability-identifier-naming.MemberCase + value: lower_case + - key: readability-identifier-naming.MemberPrefix + value: '' + - key: readability-identifier-naming.MemberSuffix + value: '' + - key: readability-identifier-naming.NamespaceCase + value: lower_case + - key: readability-identifier-naming.NamespacePrefix + value: '' + - key: readability-identifier-naming.NamespaceSuffix + value: '' + - key: readability-identifier-naming.PrivateMemberCase + value: lower_case + - key: readability-identifier-naming.PrivateMemberPrefix + value: '' + - key: readability-identifier-naming.PrivateMemberSuffix + value: '_' + - key: readability-identifier-naming.ProtectedMemberCase + value: lower_case + - key: readability-identifier-naming.ProtectedMemberPrefix + value: '' + - key: readability-identifier-naming.ProtectedMemberSuffix + value: '_' + - key: readability-identifier-naming.StaticConstantCase + value: CamelCase + - key: readability-identifier-naming.StaticConstantPrefix + value: 'k' + - key: readability-identifier-naming.StaticConstantSuffix + value: '' + - key: readability-identifier-naming.StructCase + value: lower_case + - key: readability-identifier-naming.StructPrefix + value: '' + - key: readability-identifier-naming.StructSuffix + value: '' + - key: readability-identifier-naming.TypeAliasCase + value: lower_case + - key: readability-identifier-naming.TypeAliasPrefix + value: '' + - key: readability-identifier-naming.TypeAliasSuffix + value: '' + - key: readability-identifier-naming.TypeTemplateParameterCase + value: CamelCase + - key: readability-identifier-naming.TypeTemplateParameterPrefix + value: '' + - key: readability-identifier-naming.TypeTemplateParameterSuffix + value: '' + - key: readability-identifier-naming.TypedefCase + value: lower_case + - key: readability-identifier-naming.TypedefPrefix + value: '' + - key: readability-identifier-naming.TypedefSuffix + value: '' + - key: readability-identifier-naming.VariableCase + value: lower_case + - key: readability-identifier-naming.VariablePrefix + value: '' + - key: readability-identifier-naming.VariableSuffix + value: '' +... + diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7163b0dd77..947d0318cb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) -project(CUML VERSION 0.14.0 LANGUAGES CXX CUDA) +project(RAFT VERSION 0.14.0 LANGUAGES CXX CUDA) ############################################################################## # - build type --------------------------------------------------------------- @@ -34,9 +34,14 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) "Debug" "Release") endif() +# this is needed for clang-tidy runs +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + ############################################################################## # - User Options ------------------------------------------------------------ +option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) + option(EMPTY_MARKER_KERNEL "Enable empty marker kernel after nvtxRangePop" ON) option(KERNEL_INFO "Enable kernel resource usage info" OFF) @@ -45,6 +50,8 @@ option(LINE_INFO "Enable lineinfo in nvcc" OFF) option(NVTX "Enable nvtx markers" OFF) +option(BUILD_RAFT_TESTS "Build raft unit-tests" ON) + set(PARALLEL_LEVEL "" CACHE STRING "Sub-projects parallel level for compilation. Currently only affects FAISS" ) @@ -53,12 +60,13 @@ set(GPU_ARCHS "" CACHE STRING Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system") - ############################################################################## # - Requirements ------------------------------------------------------------- find_package(CUDA 10.0 REQUIRED) +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + ############################################################################## # - Compiler Options -------------------------------------------------------- @@ -85,7 +93,7 @@ endif(OPENMP_FOUND) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") if(${CMAKE_VERSION} VERSION_LESS "3.17.0") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++14") endif(${CMAKE_VERSION} VERSION_LESS "3.17.0") if(LINE_INFO) @@ -149,8 +157,10 @@ endif(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --diag_suppress=unrecognized_gcc_pragma") +############################################################################## +# - dependencies ------------------------------------------------------------- - +include(cmake/Dependencies.cmake) ############################################################################## # - include paths ------------------------------------------------------------ @@ -159,17 +169,61 @@ set(RAFT_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE STRING "Path to RAFT include directories") set(RAFT_INCLUDE_DIRECTORIES - ${RAFT_INCLUDE_DIR} -) + ${RAFT_INCLUDE_DIR} + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +if(DEFINED ENV{CONDA_PREFIX}) + message(STATUS "Using RMM installation froM $ENV{CONDA_PREFIX}") + list(APPEND RAFT_INCLUDE_DIRECTORIES $ENV{CONDA_PREFIX}/include) +endif(DEFINED ENV{CONDA_PREFIX}) ############################################################################## -# - build test executables --------------------------------------------------- +# - libraries ---------------------------------------------------------------- + +set(RAFT_LINK_LIBRARIES + ${CUDA_cublas_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_CUDART_LIBRARY} + ${CUDA_cusparse_LIBRARY} + rmm) -add_executable(test_raft - test/test.cpp) +set(RAFT_LINK_DIRECTORIES "") -target_include_directories(test_raft PRIVATE - ${RAFT_INCLUDE_DIRECTORIES}) +if(DEFINED ENV{CONDA_PREFIX}) + list(APPEND RAFT_LINK_DIRECTORIES $ENV{CONDA_PREFIX}/lib) +endif(DEFINED ENV{CONDA_PREFIX}) + +############################################################################## +# - build test executable ---------------------------------------------------- + +if(BUILD_RAFT_TESTS) + find_package(OpenMP REQUIRED) + + # keep the files in alphabetical order! + add_executable(test_raft + test/cudart_utils.cpp + test/handle.cpp + test/mr/device/buffer.cpp + test/mr/host/buffer.cpp + test/test.cpp) + + target_include_directories(test_raft + PRIVATE + ${RAFT_INCLUDE_DIRECTORIES} + ${GTEST_DIR}/include) + + target_link_directories(test_raft + PRIVATE + ${RAFT_LINK_DIRECTORIES}) + + target_link_libraries(test_raft + PRIVATE + ${RAFT_LINK_LIBRARIES} + gtestlib + gtest_mainlib + OpenMP::OpenMP_CXX + Threads::Threads) +endif(BUILD_RAFT_TESTS) ############################################################################## # - doxygen targets ---------------------------------------------------------- diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake new file mode 100644 index 0000000000..d9d15a0ea8 --- /dev/null +++ b/cpp/cmake/Dependencies.cmake @@ -0,0 +1,42 @@ +#============================================================================= +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +include(ExternalProject) + +############################################################################## +# - googletest --------------------------------------------------------------- + +set(GTEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/googletest CACHE STRING + "Path to googletest repo") +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 6ce9b98f541b8bcd84c5c5b3483f29a933c4aefb + PREFIX ${GTEST_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_LIBDIR=lib + BUILD_BYPRODUCTS ${GTEST_DIR}/lib/libgtest.a + ${GTEST_DIR}/lib/libgtest_main.a + UPDATE_COMMAND "") +add_library(gtestlib STATIC IMPORTED) +add_library(gtest_mainlib STATIC IMPORTED) +set_property(TARGET gtestlib PROPERTY + IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest.a) +set_property(TARGET gtest_mainlib PROPERTY + IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest_main.a) +add_dependencies(gtestlib googletest) +add_dependencies(gtest_mainlib googletest) diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index 0c12fb09dc..f380d276b2 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -26,4 +26,4 @@ inline std::string test_raft() { return status; } -} // namespace raft +} // namespace raft diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h new file mode 100644 index 0000000000..8bd4caf121 --- /dev/null +++ b/cpp/include/raft/cudart_utils.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +///@todo: enable once logging has been enabled in raft +//#include "logger.hpp" + +namespace raft { + +/** base exception class for the whole of raft */ +class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(const exception& src) noexcept + : std::exception(), msg_(src.what()) { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(const std::string _msg) noexcept + : std::exception(), msg_(std::move(_msg)) { + collect_call_stack(); + } + + /** get the message associated with this exception */ + const char* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept { +#ifdef __GNUC__ + constexpr int kMaxStackDepth = 64; + void* stack[kMaxStackDepth]; // NOLINT + auto depth = backtrace(stack, kMaxStackDepth); + std::ostringstream oss; + oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; + char** strings = backtrace_symbols(stack, depth); + if (strings == nullptr) { + oss << "But no stack trace could be found!" << std::endl; + msg_ += oss.str(); + return; + } + ///@todo: support for demangling of C++ symbol names + for (int i = 0; i < depth; ++i) { + oss << "#" << i << " in " << strings[i] << std::endl; + } + free(strings); + msg_ += oss.str(); +#endif // __GNUC__ + } +}; + +/** macro to throw a runtime error */ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf(errMsg, sizeof(errMsg), \ + "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ + } while (0) + +/** macro to check for a conditional and assert on failure */ +#define ASSERT(check, fmt, ...) \ + do { \ + if (!(check)) THROW(fmt, ##__VA_ARGS__); \ + } while (0) + +/** check for cuda runtime API errors and assert accordingly */ +#define CUDA_CHECK(call) \ + do { \ + cudaError_t status = call; \ + ASSERT(status == cudaSuccess, "FAIL: call='%s'. Reason:%s", #call, \ + cudaGetErrorString(status)); \ + } while (0) + +///@todo: enable this only after we have added logging support in raft +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +// #define CUDA_CHECK_NO_THROW(call) \ +// do { \ +// cudaError_t status = call; \ +// if (status != cudaSuccess) { \ +// RAFT_LOG_ERROR("CUDA call='%s' at file=%s line=%d failed with %s ", \ +// #call, __FILE__, __LINE__, cudaGetErrorString(status)); \ +// } \ +// } while (0) + +/** helper method to get max usable shared mem per block parameter */ +inline int get_shared_memory_per_block() { + int dev_id; + CUDA_CHECK(cudaGetDevice(&dev_id)); + int smem_per_blk; + CUDA_CHECK(cudaDeviceGetAttribute( + &smem_per_blk, cudaDevAttrMaxSharedMemoryPerBlock, dev_id)); + return smem_per_blk; +} +/** helper method to get multi-processor count parameter */ +inline int get_multi_processor_count() { + int dev_id; + CUDA_CHECK(cudaGetDevice(&dev_id)); + int mp_count; + CUDA_CHECK( + cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, dev_id)); + return mp_count; +} + +/** Helper method to get to know warp size in device code */ +constexpr inline int warp_size() { return 32; } + +/** + * @brief Generic copy method for all kinds of transfers + * @tparam Type data type + * @param dst destination pointer + * @param src source pointer + * @param len lenth of the src/dst buffers in terms of number of elements + * @param stream cuda stream + */ +template +void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { + CUDA_CHECK( + cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +} + +/** + * @defgroup Copy Copy methods + * These are here along with the generic 'copy' method in order to improve + * code readability using explicitly specified function names + * @{ + */ +/** performs a host to device copy */ +template +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, + cudaStream_t stream) { + copy(d_ptr, h_ptr, len, stream); +} + +/** performs a device to host copy */ +template +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, + cudaStream_t stream) { + copy(h_ptr, d_ptr, len, stream); +} + +template +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, + cudaStream_t stream) { + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), + cudaMemcpyDeviceToDevice, stream)); +} +/** @} */ + +/** + * @defgroup Debug Utils for debugging host/device buffers + * @{ + */ +template +void print_host_vector(const char* variable_name, const T* host_mem, + size_t componentsCount, OutStream& out) { + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; +} + +template +void print_device_vector(const char* variable_name, const T* devMem, + size_t componentsCount, OutStream& out) { + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), + cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; +} +/** @} */ + +}; // namespace raft diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp new file mode 100644 index 0000000000..81e63342ce --- /dev/null +++ b/cpp/include/raft/handle.hpp @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +///@todo: enable once we have migrated cuml-comms layer too +//#include + +#include +#include +#include +#include +#include +#include "cudart_utils.h" + +namespace raft { + +/** + * @brief Main handle object that stores all necessary context used for calling + * necessary cuda kernels and/or libraries + */ +class handle_t { + private: + static constexpr int kNumDefaultWorkerStreams = 0; + + public: + /** + * @brief Construct a handle with the specified number of worker streams + * + * @param[in] n_streams number worker streams to be created + */ + explicit handle_t(int n_streams = kNumDefaultWorkerStreams) + : dev_id_([]() -> int { + int cur_dev = -1; + CUDA_CHECK(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + num_streams_(n_streams), + device_allocator_(std::make_shared()), + host_allocator_(std::make_shared()) { + create_resources(); + } + + /** Destroys all held-up resources */ + ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + void set_stream(cudaStream_t stream) { user_stream_ = stream; } + cudaStream_t get_stream() const { return user_stream_; } + + void set_device_allocator(std::shared_ptr allocator) { + device_allocator_ = allocator; + } + std::shared_ptr get_device_allocator() const { + return device_allocator_; + } + + void set_host_allocator(std::shared_ptr allocator) { + host_allocator_ = allocator; + } + std::shared_ptr get_host_allocator() const { + return host_allocator_; + } + + cublasHandle_t get_cublas_handle() const { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { + CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + cudaStream_t get_internal_stream(int sid) const { return streams_[sid]; } + int get_num_internal_streams() const { return num_streams_; } + std::vector get_internal_streams() const { + std::vector int_streams_vec(num_streams_); + for (auto s : streams_) { + int_streams_vec.push_back(s); + } + return int_streams_vec; + } + + void wait_on_user_stream() const { + CUDA_CHECK(cudaEventRecord(event_, user_stream_)); + for (auto s : streams_) { + CUDA_CHECK(cudaStreamWaitEvent(s, event_, 0)); + } + } + + void wait_on_internal_streams() const { + for (auto s : streams_) { + CUDA_CHECK(cudaEventRecord(event_, s)); + CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); + } + } + + ///@todo: enable this once we have cuml-comms migrated + // void setCommunicator( + // std::shared_ptr communicator); + // const MLCommon::cumlCommunicator& getCommunicator() const; + // bool commsInitialized() const; + + const cudaDeviceProp& get_device_properties() const { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = true; + } + return prop_; + } + + private: + const int dev_id_; + const int num_streams_; + std::vector streams_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::shared_ptr device_allocator_; + std::shared_ptr host_allocator_; + cudaStream_t user_stream_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + ///@todo: enable this once we have migrated cuml-comms + //std::shared_ptr _communicator; + + void create_resources() { + for (int i = 0; i < num_streams_; ++i) { + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + streams_.push_back(stream); + } + CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() { + ///@todo: enable *_NO_THROW variants once we have enabled logging + if (cusparse_initialized_) { + //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); + CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); + } + if (cusolver_dn_initialized_) { + //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { + //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + } + while (!streams_.empty()) { + //CUDA_CHECK_NO_THROW(cudaStreamDestroy(streams_.back())); + CUDA_CHECK(cudaStreamDestroy(streams_.back())); + streams_.pop_back(); + } + //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); + CUDA_CHECK(cudaEventDestroy(event_)); + } +}; // class handle_t + +/** + * @brief RAII approach to synchronizing across all streams in the handle + */ +class stream_syncer { + public: + explicit stream_syncer(const handle_t& handle) : handle_(handle) { + handle_.wait_on_user_stream(); + } + ~stream_syncer() { handle_.wait_on_internal_streams(); } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; +}; // class stream_syncer + +} // namespace raft diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h new file mode 100644 index 0000000000..cd8a508a84 --- /dev/null +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +///@todo: enable this once we have logger enabled +//#include +#include +#include + +namespace raft { +namespace linalg { + +#define _CUBLAS_ERR_TO_STR(err) \ + case err: \ + return #err +inline const char *cublas_error_to_string(cublasStatus_t err) { + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: + return "CUBLAS_STATUS_UNKNOWN"; + }; +} +#undef _CUBLAS_ERR_TO_STR + +/** check for cublas runtime API errors and assert accordingly */ +#define CUBLAS_CHECK(call) \ + do { \ + cublasStatus_t err = call; \ + ASSERT(err == CUBLAS_STATUS_SUCCESS, \ + "CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ + raft::linalg::cublas_error_to_string(err)); \ + } while (0) + +///@todo: enable this once we have logging enabled +// /** check for cublas runtime API errors but do not assert */ +// #define CUBLAS_CHECK_NO_THROW(call) \ +// do { \ +// cublasStatus_t err = call; \ +// if (err != CUBLAS_STATUS_SUCCESS) { \ +// CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ +// raft::linalg::cublas_error_to_string(err)); \ +// } \ +// } while (0) + +/** + * @defgroup Axpy cublas ax+y operations + * @{ + */ +template +cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha, + const T *x, int incx, T *y, int incy, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, + const float *alpha, const float *x, int incx, + float *y, int incy, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSaxpy(handle, n, alpha, x, incx, y, incy); +} + +template <> +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, + const double *alpha, const double *x, int incx, + double *y, int incy, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDaxpy(handle, n, alpha, x, incx, y, incy); +} +/** @} */ + +/** + * @defgroup gemv cublas gemv calls + * @{ + */ +template +cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA, + int m, int n, const T *alfa, const T *A, int lda, + const T *x, int incx, const T *beta, T *y, int incy, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, int m, int n, + const float *alfa, const float *A, int lda, + const float *x, int incx, const float *beta, + float *y, int incy, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, + incy); +} + +template <> +inline cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, int m, int n, + const double *alfa, const double *A, int lda, + const double *x, int incx, const double *beta, + double *y, int incy, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, + incy); +} +/** @} */ + +/** + * @defgroup ger cublas a(x*y.T) + A calls + * @{ + */ +template +cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha, + const T *x, int incx, const T *y, int incy, T *A, + int lda, cudaStream_t stream); +template <> +inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, + const float *alpha, const float *x, int incx, + const float *y, int incy, float *A, int lda, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +template <> +inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, + const double *alpha, const double *x, int incx, + const double *y, int incy, double *A, int lda, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); +} +/** @} */ + +/** + * @defgroup gemm cublas gemm calls + * @{ + */ +template +cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, + cublasOperation_t transB, int m, int n, int k, + const T *alfa, const T *A, int lda, const T *B, + int ldb, const T *beta, T *C, int ldc, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, int k, + const float *alfa, const float *A, int lda, + const float *B, int ldb, const float *beta, + float *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +inline cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, int k, + const double *alfa, const double *A, int lda, + const double *B, int ldb, const double *beta, + double *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, + beta, C, ldc); +} +/** @} */ + +/** + * @defgroup gemmbatched cublas gemmbatched calls + * @{ + */ +template +cublasStatus_t cublasgemmBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const T *alpha, + const T *const Aarray[], // NOLINT + int lda, const T *const Barray[], // NOLINT + int ldb, const T *beta, + T *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgemmBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, + const float *const Aarray[], // NOLINT + int lda, const float *const Barray[], // NOLINT + int ldb, const float *beta, float *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, + Barray, ldb, beta, Carray, ldc, batchCount); +} + +template <> +inline cublasStatus_t cublasgemmBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, + const double *const Aarray[], // NOLINT + int lda, const double *const Barray[], // NOLINT + int ldb, const double *beta, double *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, + Barray, ldb, beta, Carray, ldc, batchCount); +} +/** @} */ + +/** + * @defgroup gemmbatched cublas gemmbatched calls + * @{ + */ +template +cublasStatus_t cublasgemmStridedBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const T *alpha, const T *const Aarray, int lda, + int64_t strideA, const T *const Barray, int ldb, int64_t strideB, + const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgemmStridedBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, const float *const Aarray, int lda, + int64_t strideA, const float *const Barray, int ldb, int64_t strideB, + const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, + Aarray, lda, strideA, Barray, ldb, strideB, + beta, Carray, ldc, strideC, batchCount); +} + +template <> +inline cublasStatus_t cublasgemmStridedBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, const double *const Aarray, int lda, + int64_t strideA, const double *const Barray, int ldb, int64_t strideB, + const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, + Aarray, lda, strideA, Barray, ldb, strideB, + beta, Carray, ldc, strideC, batchCount); +} +/** @} */ + +/** + * @defgroup solverbatched cublas getrf/gettribatched calls + * @{ + */ + +template +cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n, // NOLINT + T *const A[], // NOLINT + int lda, int *P, int *info, int batchSize, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, float *const A[], // NOLINT + int lda, int *P, int *info, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); +} + +template <> +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, double *const A[], // NOLINT + int lda, int *P, int *info, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); +} + +template +cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n, // NOLINT + const T *const A[], // NOLINT + int lda, const int *P, + T *const C[], // NOLINT + int ldc, int *info, int batchSize, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, int n, const float *const A[], // NOLINT + int lda, const int *P, float *const C[], // NOLINT + int ldc, int *info, int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +template <> +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, int n, const double *const A[], // NOLINT + int lda, const int *P, double *const C[], // NOLINT + int ldc, int *info, int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +/** @} */ + +/** + * @defgroup gelsbatched cublas gelsbatched calls + * @{ + */ + +template +inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t trans, int m, int n, + int nrhs, T *Aarray[], // NOLINT + int lda, T *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t trans, int m, int n, + int nrhs, float *Aarray[], // NOLINT + int lda, float *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + info, devInfoArray, batchSize); +} + +template <> +inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t trans, int m, int n, + int nrhs, double *Aarray[], // NOLINT + int lda, double *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + info, devInfoArray, batchSize); +} + +/** @} */ + +/** + * @defgroup geam cublas geam calls + * @{ + */ +template +cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, + cublasOperation_t transB, int m, int n, const T *alfa, + const T *A, int lda, const T *beta, const T *B, + int ldb, T *C, int ldc, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, + const float *alfa, const float *A, int lda, + const float *beta, const float *B, int ldb, + float *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, + C, ldc); +} + +template <> +inline cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, + const double *alfa, const double *A, int lda, + const double *beta, const double *B, int ldb, + double *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, + C, ldc); +} +/** @} */ + +/** + * @defgroup symm cublas symm calls + * @{ + */ +template +cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, const T *alpha, + const T *A, int lda, const T *B, int ldb, + const T *beta, T *C, int ldc, cudaStream_t stream); + +template <> +inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, + const float *alpha, const float *A, int lda, + const float *B, int ldb, const float *beta, + float *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +template <> +inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, + const double *alpha, const double *A, int lda, + const double *B, int ldb, const double *beta, + double *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} +/** @} */ + +/** + * @defgroup syrk cublas syrk calls + * @{ + */ +template +cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, const T *alpha, + const T *A, int lda, const T *beta, T *C, int ldc, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, + const float *alpha, const float *A, int lda, + const float *beta, float *C, int ldc, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +template <> +inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, + const double *alpha, const double *A, int lda, + const double *beta, double *C, int ldc, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} +/** @} */ + +/** + * @defgroup nrm2 cublas nrm2 calls + * @{ + */ +template +cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx, + T *result, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x, + int incx, float *result, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSnrm2(handle, n, x, incx, result); +} + +template <> +inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x, + int incx, double *result, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDnrm2(handle, n, x, incx, result); +} +/** @} */ + +template +cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, const T *alpha, + const T *A, int lda, T *B, int ldb, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const float *alpha, const float *A, int lda, + float *B, int ldb, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, + ldb); +} + +template <> +inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const double *alpha, const double *A, int lda, + double *B, int ldb, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, + ldb); +} + +/** + * @defgroup dot cublas dot calls + * @{ + */ +template +cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx, + const T *y, int incy, T *result, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x, + int incx, const float *y, int incy, + float *result, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSdot(handle, n, x, incx, y, incy, result); +} + +template <> +inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, + int incx, const double *y, int incy, + double *result, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDdot(handle, n, x, incx, y, incy, result); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h new file mode 100644 index 0000000000..92ba1a2194 --- /dev/null +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -0,0 +1,687 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include +#include + +namespace raft { +namespace linalg { + +#define _CUSOLVER_ERR_TO_STR(err) \ + case err: \ + return #err; +inline const char *cusolver_error_to_string(cusolverStatus_t err) { + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: + return "CUSOLVER_STATUS_UNKNOWN"; + }; +} +#undef _CUSOLVER_ERR_TO_STR + +/** check for cusolver runtime API errors and assert accordingly */ +#define CUSOLVER_CHECK(call) \ + do { \ + cusolverStatus_t err = call; \ + ASSERT(err == CUSOLVER_STATUS_SUCCESS, \ + "CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \ + raft::linalg::cusolver_error_to_string(err)); \ + } while (0) + +///@todo: enable this once logging is enabled +// /** check for cusolver runtime API errors but do not assert */ +// #define CUSOLVER_CHECK_NO_THROW(call) \ +// do { \ +// cusolverStatus_t err = call; \ +// if (err != CUSOLVER_STATUS_SUCCESS) { \ +// CUML_LOG_ERROR("CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \ +// raft::linalg::cusolver_error_to_string(err)); \ +// } \ +// } while (0) + +/** + * @defgroup Getrf cusolver getrf operations + * @{ + */ +template +cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m, // NOLINT + int n, T *A, int lda, T *Workspace, + int *devIpiv, int *devInfo, + cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, float *A, int lda, + float *Workspace, int *devIpiv, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, double *A, int lda, + double *Workspace, int *devIpiv, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +template +cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + +template <> +inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); +} + +template <> +inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); +} + +/** + * @defgroup Getrs cusolver getrs operations + * @{ + */ +template +cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT + cublasOperation_t trans, int n, int nrhs, + const T *A, int lda, const int *devIpiv, T *B, + int ldb, int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT + cublasOperation_t trans, int n, + int nrhs, const float *A, int lda, + const int *devIpiv, float *B, int ldb, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, + devInfo); +} + +template <> +inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT + cublasOperation_t trans, int n, + int nrhs, const double *A, int lda, + const int *devIpiv, double *B, int ldb, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, + devInfo); +} +/** @} */ + +/** + * @defgroup syevd cusolver syevd operations + * @{ + */ +template +cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const T *A, int lda, const T *W, int *lwork); + +template <> +inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork) { + return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); +} + +template <> +inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork) { + return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); +} +/** @} */ + +/** + * @defgroup syevj cusolver syevj operations + * @{ + */ +template +cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, T *A, int lda, T *W, T *work, int lwork, + int *info, syevjInfo_t params, + cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnsyevj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, float *A, int lda, float *W, float *work, int lwork, int *info, + syevjInfo_t params, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, + params); +} + +template <> +inline cusolverStatus_t cusolverDnsyevj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, double *A, int lda, double *W, double *work, int lwork, int *info, + syevjInfo_t params, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, + params); +} + +template +cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params); + +template <> +inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork, + syevjInfo_t params) { + return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, + params); +} + +template <> +inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork, + syevjInfo_t params) { + return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, + params); +} +/** @} */ + +/** + * @defgroup syevd cusolver syevd operations + * @{ + */ +template +cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, T *A, int lda, T *W, T *work, int lwork, + int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, float *A, + int lda, float *W, float *work, + int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, + devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, double *A, + int lda, double *W, double *work, + int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, + devInfo); +} +/** @} */ + +#if CUDART_VERSION >= 10010 +/** + * @defgroup syevdx cusolver syevdx operations + * @{ +*/ +template +cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu, + int *h_meig, const T *W, int *lwork); + +template <> +inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, + int il, int iu, int *h_meig, const float *W, int *lwork) { + return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, + vu, il, iu, h_meig, W, lwork); +} + +template <> +inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu, + int il, int iu, int *h_meig, const double *W, int *lwork) { + return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, + vu, il, iu, h_meig, W, lwork); +} + +template +cusolverStatus_t cusolverDnsyevdx( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu, + int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnsyevdx( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, + int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, + h_meig, W, work, lwork, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnsyevdx( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, + int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, + h_meig, W, work, lwork, devInfo); +} +/** @} */ +#endif + +/** + * @defgroup svd cusolver svd operations + * @{ + */ +template +cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int *lwork) { + if (typeid(T) == typeid(float)) { + return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); + } else { + return cusolverDnDgesvd_bufferSize(handle, m, n, lwork); + } +} +template +cusolverStatus_t cusolverDngesvd( // NOLINT + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork, + T *rwork, int *devInfo, cudaStream_t stream); +template <> +inline cusolverStatus_t cusolverDngesvd( // NOLINT + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, + float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, + ldvt, work, lwork, rwork, devInfo); +} +template <> +inline cusolverStatus_t cusolverDngesvd( // NOLINT + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, + double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, + ldvt, work, lwork, rwork, devInfo); +} + +template +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv, + int *lwork, gesvdjInfo_t params); +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const float *A, int lda, const float *S, const float *U, int ldu, + const float *V, int ldv, int *lwork, gesvdjInfo_t params) { + return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, + ldu, V, ldv, lwork, params); +} +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const double *A, int lda, const double *S, const double *U, int ldu, + const double *V, int ldv, int *lwork, gesvdjInfo_t params) { + return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, + ldu, V, ldv, lwork, params); +} +template +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork, + int *info, gesvdjInfo_t params, cudaStream_t stream); +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, + float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, + work, lwork, info, params); +} +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, + double *work, int lwork, int *info, gesvdjInfo_t params, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, + work, lwork, info, params); +} +/** @} */ + +/** + * @defgroup potrf cusolver potrf operations + * @{ + */ +template +cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda, + int *Lwork); + +template <> +inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, + int *Lwork) { + return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); +} + +template <> +inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, + int *Lwork) { + return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); +} + +template +inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, T *A, + int lda, T *Workspace, int Lwork, + int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, float *A, + int lda, float *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, double *A, + int lda, double *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} +/** @} */ + +/** + * @defgroup potrs cusolver potrs operations + * @{ + */ +template +cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, int nrhs, + const T *A, int lda, T *B, int ldb, + int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, int nrhs, + const float *A, int lda, float *B, + int ldb, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, int nrhs, + const double *A, int lda, double *B, + int ldb, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} +/** @} */ + +/** + * @defgroup geqrf cusolver geqrf operations + * @{ + */ +template +cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m, // NOLINT + int n, T *A, int lda, T *TAU, T *Workspace, + int Lwork, int *devInfo, cudaStream_t stream); +template <> +inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, float *A, int lda, + float *TAU, float *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} +template <> +inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, double *A, int lda, + double *TAU, double *Workspace, + int Lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +template +cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); +template <> +inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); +} +template <> +inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); +} +/** @} */ + +/** + * @defgroup orgqr cusolver orgqr operations + * @{ + */ +template +cusolverStatus_t cusolverDnorgqr( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau, + T *work, int lwork, int *devInfo, cudaStream_t stream); +template <> +inline cusolverStatus_t cusolverDnorgqr( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda, + const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); +} +template <> +inline cusolverStatus_t cusolverDnorgqr( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda, + const double *tau, double *work, int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); +} + +template +cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda, + const T *TAU, int *lwork); +template <> +inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, + const float *TAU, int *lwork) { + return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); +} +template <> +inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, + const double *TAU, int *lwork) { + return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); +} +/** @} */ + +/** + * @defgroup ormqr cusolver ormqr operations + * @{ + */ +template +cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle, // NOLINT + cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const T *A, int lda, + const T *tau, T *C, int ldc, T *work, + int lwork, int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnormqr( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, float *C, + int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, + work, lwork, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnormqr( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, double *C, + int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, + work, lwork, devInfo); +} + +template +cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc, + int *lwork); + +template <> +inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, + const float *C, int ldc, int *lwork) { + return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, + C, ldc, lwork); +} + +template <> +inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, + const double *C, int ldc, int *lwork) { + return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, + C, ldc, lwork); +} +/** @} */ + +/** + * @defgroup csrqrBatched cusolver batched + * @{ + */ +template +cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes); + +template <> +inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes) { + return cusolverSpScsrqrBufferInfoBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, + info, internalDataInBytes, workspaceInBytes); +} + +template <> +inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes) { + return cusolverSpDcsrqrBufferInfoBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, + info, internalDataInBytes, workspaceInBytes); +} + +template +cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info, + void *pBuffer, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const float *b, float *x, int batchSize, + csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, + csrRowPtrA, csrColIndA, b, x, batchSize, + info, pBuffer); +} + +template <> +inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const double *b, double *x, int batchSize, + csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, + csrRowPtrA, csrColIndA, b, x, batchSize, + info, pBuffer); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/mr/allocator.hpp b/cpp/include/raft/mr/allocator.hpp new file mode 100644 index 0000000000..707b71d468 --- /dev/null +++ b/cpp/include/raft/mr/allocator.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace mr { + +/** + * @brief Interface for an asynchronous device/host allocator. + * + * An implementation of this interface can make the following assumptions: + * - It does not need to be but it can allow async allocate and deallocate. + * + * @note This interface does NOT support RAII. Thus, if you need RAII-enabled + * interface, better to use `device_buffer` or `host_buffer`. + */ +class base_allocator { + public: + /** + * @brief Asynchronously allocates a memory region. + * + * An implementation of this need to return a allocation of n bytes properly + * align bytes on the configured device. The allocation can optionally be + * asynchronous in the sense that it is only save to use after all work + * submitted to the passed in stream prior to the call to allocate has + * completed. If the allocation is used before, e.g. in another stream the + * behaviour may be undefined. + * @todo: Add alignment requirments. + * + * @param[in] n number of bytes to allocate + * @param[in] stream stream to issue the possible asynchronous allocation in + */ + virtual void* allocate(std::size_t n, cudaStream_t stream) = 0; + + /** + * @brief Asynchronously deallocates device memory + * + * An implementation of this need to ensure that the allocation that the + * passed in pointer points to remains usable until all work sheduled in + * stream prior to the call to deallocate has completed. + * + * @param[inout] p pointer to the buffer to deallocte + * @param[in] n size of the buffer to deallocte in bytes + * @param[in] stream stream in which the allocation might be still in use + */ + virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) = 0; + + virtual ~base_allocator() = default; +}; // class base_allocator + +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp new file mode 100644 index 0000000000..f1d74d4b24 --- /dev/null +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace mr { + +/** + * @brief Base for all RAII-based owning of temporary memory allocations. This + * class should ideally not be used by users directly, but instead via + * the child classes `device_buffer` and `host_buffer`. + * + * @tparam T data type + * @tparam AllocatorT The underly allocator object + */ +template +class buffer_base { + public: + using size_type = std::size_t; + using value_type = T; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = T&; + using const_reference = const T&; + + buffer_base() = delete; + + buffer_base(const buffer_base& other) = delete; + + buffer_base& operator=(const buffer_base& other) = delete; + + /** + * @brief Main ctor + * + * @param[in] allocator asynchronous allocator used for managing buffer life + * @param[in] stream cuda stream where this allocation operations are async + * @param[in] n size of the buffer (in number of elements) + */ + buffer_base(std::shared_ptr allocator, cudaStream_t stream, + size_type n = 0) + : size_(n), + capacity_(n), + data_(nullptr), + stream_(stream), + allocator_(std::move(allocator)) { + if (capacity_ > 0) { + data_ = static_cast( + allocator_->allocate(capacity_ * sizeof(value_type), stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); + } + } + + ~buffer_base() { release(); } + + value_type* data() { return data_; } + + const value_type* data() const { return data_; } + + size_type size() const { return size_; } + + void clear() { size_ = 0; } + + iterator begin() { return data_; } + + const_iterator begin() const { return data_; } + + iterator end() { return data_ + size_; } + + const_iterator end() const { return data_ + size_; } + + /** + * @brief Reserve new memory size for this buffer. + * + * It re-allocates a fresh buffer if the new requested capacity is more than + * the current one, copies the old buffer contents to this new buffer and + * removes the old one. + * + * @param[in] new_capacity new capacity (in number of elements) + * @param[in] stream cuda stream where allocation operations are queued + * @{ + */ + void reserve(size_type new_capacity) { + if (new_capacity > capacity_) { + auto* new_data = static_cast( + allocator_->allocate(new_capacity * sizeof(value_type), stream_)); + if (size_ > 0) { + raft::copy(new_data, data_, size_, stream_); + allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); + } + data_ = new_data; + capacity_ = new_capacity; + } + } + + void reserve(size_type new_capacity, cudaStream_t stream) { + set_stream(stream); + reserve(new_capacity); + } + /** @} */ + + /** + * @brief Resize the underlying buffer (uses `reserve` method internally) + * + * @param[in] new_size new buffer size + * @param[in] stream cuda stream where the work will be queued + * @{ + */ + void resize(const size_type new_size) { + reserve(new_size); + size_ = new_size; + } + + void resize(const size_type new_size, cudaStream_t stream) { + set_stream(stream); + resize(new_size); + } + /** @} */ + + /** + * @brief Deletes the underlying buffer + * + * If this method is not explicitly called, it will be during the destructor + * + * @param[in] stream cuda stream where the work will be queued + * @{ + */ + void release() { + if (nullptr != data_) { + allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); + } + data_ = nullptr; + capacity_ = 0; + size_ = 0; + } + + void release(cudaStream_t stream) { + set_stream(stream); + release(); + } + /** @} */ + + /** + * @brief returns the underlying allocator used + * + * @return the allocator pointer + */ + std::shared_ptr get_allocator() const { return allocator_; } + + /** + * @brief returns the underlying stream used + * + * @return the cuda stream + */ + cudaStream_t get_stream() const { return stream_; } + + protected: + value_type* data_; + + private: + size_type size_; + size_type capacity_; + cudaStream_t stream_; + std::shared_ptr allocator_; + + /** + * @brief Sets a new cuda stream where the future operations will be queued + * + * This method makes sure that the inter-stream dependencies are met and taken + * care of, before setting the input stream as a new stream for this buffer. + * Ideally, the same cuda stream passed during constructor is expected to be + * used throughout this buffer's lifetime, for performance. + * + * @param[in] stream new cuda stream to be set. If it is the same as the + * current one, then this method will be a no-op. + */ + void set_stream(cudaStream_t stream) { + if (stream_ != stream) { + cudaEvent_t event; + CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + CUDA_CHECK(cudaEventRecord(event, stream_)); + CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); + stream_ = stream; + CUDA_CHECK(cudaEventDestroy(event)); + } + } +}; // class buffer_base + +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp new file mode 100644 index 0000000000..be6ea6fc67 --- /dev/null +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace mr { +namespace device { + +/** + * @brief An explicit interface for an asynchronous device allocator. + * + * This is mostly done in order to reduce work needed in cuML codebase. + * An implementation of this interface can make the following assumptions, + * further to the ones listed in `Allocator`: + * - Allocations may be always on the device that was specified on construction. + */ +class allocator : public base_allocator {}; + +/** Default device allocator based on the one provided by RMM */ +class default_allocator : public allocator { + public: + void* allocate(std::size_t n, cudaStream_t stream) override { + void* ptr = rmm::mr::get_default_resource()->allocate(n, stream); + return ptr; + } + + void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + rmm::mr::get_default_resource()->deallocate(p, n, stream); + } +}; // class default_allocator + +}; // namespace device +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp new file mode 100644 index 0000000000..39b5674ce4 --- /dev/null +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "allocator.hpp" + +namespace raft { +namespace mr { +namespace device { + +/** + * @brief RAII object owning a contiguous typed device buffer. The passed in + * allocator supports asynchronous allocation and deallocation so this + * can also be used for temporary memory + * + * @code{.cpp} + * template + * void foo(..., cudaStream_t stream) { + * ... + * raft::mr::device::buffer temp(stream, 0); + * ... + * temp.resize(n); + * kernelA<<>>(...,temp.data(),...); + * kernelB<<>>(...,temp.data(),...); + * temp.release(); + * ... + * } + * @endcode + */ +template +class buffer : public buffer_base { + public: + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; + using const_reference = typename buffer_base::const_reference; + + buffer() = delete; + + buffer(const buffer& other) = delete; + + buffer& operator=(const buffer& other) = delete; + + buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) + : buffer_base(alloc, stream, n) {} +}; // class buffer + +}; // namespace device +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp new file mode 100644 index 0000000000..9ad6ea7532 --- /dev/null +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace mr { +namespace host { + +/** + * @brief An explicit interface for an asynchronous host allocations. + * + * This is mostly done in order to reduce work needed in cuML codebase. + * An implementation of this interface can make the following assumptions, + * further to the ones listed in `Allocator`: + * - Allocations don't need to be zero copy accessible form a device. + */ +class allocator : public base_allocator {}; + +/** Default cudaMallocHost/cudaFreeHost based host allocator */ +class default_allocator : public allocator { + public: + void* allocate(std::size_t n, cudaStream_t stream) override { + void* ptr = nullptr; + CUDA_CHECK(cudaMallocHost(&ptr, n)); + return ptr; + } + + void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + ///@todo: enable this once logging is enabled in raft + //CUDA_CHECK_NO_THROW(cudaFreeHost(p)); + CUDA_CHECK(cudaFreeHost(p)); + } +}; // class default_allocator + +}; // namespace host +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp new file mode 100644 index 0000000000..c26617e072 --- /dev/null +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include "allocator.hpp" + +namespace raft { +namespace mr { +namespace host { + +/** + * @brief RAII object owning a contigous typed host buffer (aka pinned memory). + * The passed in allocator supports asynchronus allocation and + * deallocation so this can also be used for temporary memory + * + * @code{.cpp} + * template + * void foo(const T* in_d , T* out_d, ..., cudaStream_t stream) { + * ... + * raft::mr::host::buffer temp(stream, 0); + * ... + * temp.resize(n); + * raft::copy(temp.data(), in_d, temp.size()); + * ... + * raft::copy(out_d, temp.data(), temp.size()); + * temp.release(stream); + * ... + * } + * @endcode + */ +template +class buffer : public buffer_base { + public: + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; + using const_reference = typename buffer_base::const_reference; + + buffer() = delete; + + buffer(const buffer& other) = delete; + + buffer& operator=(const buffer& other) = delete; + + buffer(std::shared_ptr alloc, const device::buffer& other) + : buffer_base(alloc, other.get_stream(), other.size()) { + if (other.size() > 0) { + raft::copy(data_, other.data(), other.size(), other.get_stream()); + } + } + + buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) + : buffer_base(alloc, stream, n) {} + + reference operator[](size_type pos) { return data_[pos]; } + + const_reference operator[](size_type pos) const { return data_[pos]; } + + private: + using buffer_base::data_; +}; + +}; // namespace host +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h new file mode 100644 index 0000000000..1c63d2348b --- /dev/null +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +///@todo: enable this once logging is enabled +//#include +#include + +namespace raft { +namespace sparse { + +#define _CUSPARSE_ERR_TO_STR(err) \ + case err: \ + return #err; +inline const char* cusparse_error_to_string(cusparseStatus_t err) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 + return cusparseGetErrorString(status); +#else // CUDART_VERSION + switch (err) { + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + default: + return "CUSPARSE_STATUS_UNKNOWN"; + }; +#endif // CUDART_VERSION +} +#undef _CUSPARSE_ERR_TO_STR + +/** check for cusparse runtime API errors and assert accordingly */ +#define CUSPARSE_CHECK(call) \ + do { \ + cusparseStatus_t err = call; \ + ASSERT(err == CUSPARSE_STATUS_SUCCESS, \ + "CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ + raft::sparse::cusparse_error_to_string(err)); \ + } while (0) + +///@todo: enable this once logging is enabled +// /** check for cusparse runtime API errors but do not assert */ +// #define CUSPARSE_CHECK_NO_THROW(call) \ +// do { \ +// cusparseStatus_t err = call; \ +// if (err != CUSPARSE_STATUS_SUCCESS) { \ +// CUML_LOG_ERROR("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ +// raft::sparse::cusparse_error_to_string(err)); \ +// } \ +// } while (0) + +/** + * @defgroup gthr cusparse gather methods + * @{ + */ +template +cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals, + T* vals_sorted, int* d_P, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, + const double* vals, double* vals_sorted, + int* d_P, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, + CUSPARSE_INDEX_BASE_ZERO); +} +template <> +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, + const float* vals, float* vals_sorted, + int* d_P, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, + CUSPARSE_INDEX_BASE_ZERO); +} +/** @} */ + +/** + * @defgroup coo2csr cusparse COO to CSR converter methods + * @{ + */ +template +void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz, + int m, T* csrRowPtr, cudaStream_t stream); +template <> +inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, + int nnz, int m, int* csrRowPtr, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, + CUSPARSE_INDEX_BASE_ZERO)); +} +/** @} */ + +/** + * @defgroup coosort cusparse coo sort methods + * @{ + */ +template +size_t cusparsecoosort_bufferSizeExt( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows, + const T* cooCols, cudaStream_t stream); +template <> +inline size_t cusparsecoosort_bufferSizeExt( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows, + const int* cooCols, cudaStream_t stream) { + size_t val; + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK( + cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); + return val; +} + +template +void cusparsecoosortByRow( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P, + void* pBuffer, cudaStream_t stream); +template <> +inline void cusparsecoosortByRow( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols, + int* P, void* pBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK( + cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); +} +/** @} */ + +/** + * @defgroup Gemmi cusparse gemmi operations + * @{ + */ +inline cusparseStatus_t cusparsegemmi( + cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, + const float* A, int lda, const float* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const float* beta, float* C, int ldc) { + return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, + cscColPtrB, cscRowIndB, beta, C, ldc); +} +inline cusparseStatus_t cusparsegemmi( + cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, + const double* A, int lda, const double* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const double* beta, double* C, int ldc) { + return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, + cscColPtrB, cscRowIndB, beta, C, ldc); +} +/** @} */ + +}; // namespace sparse +}; // namespace raft diff --git a/cpp/scripts/include_checker.py b/cpp/scripts/include_checker.py index e8e752380e..1ced05e743 100644 --- a/cpp/scripts/include_checker.py +++ b/cpp/scripts/include_checker.py @@ -22,6 +22,7 @@ IncludeRegex = re.compile(r"\s*#include\s*(\S+)") +RemoveComments = re.compile(r"//.*") def parse_args(): @@ -52,6 +53,7 @@ def check_includes_in(src): errs = [] dir = os.path.dirname(src) for line_number, line in enumerate(open(src)): + line = RemoveComments.sub("", line) match = IncludeRegex.search(line) if match is None: continue diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py new file mode 100644 index 0000000000..23260d2f4d --- /dev/null +++ b/cpp/scripts/run-clang-tidy.py @@ -0,0 +1,259 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import sys +import re +import os +import subprocess +import argparse +import json +import multiprocessing as mp + + +EXPECTED_VERSION = "8.0.1" +VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") +GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") +SPACES = re.compile(r"\s+") +SEPARATOR = "-" * 16 + + +def parse_args(): + argparser = argparse.ArgumentParser("Runs clang-tidy on a project") + argparser.add_argument("-cdb", type=str, default="compile_commands.json", + help="Path to cmake-generated compilation database") + argparser.add_argument("-exe", type=str, default="clang-tidy", + help="Path to clang-tidy exe") + argparser.add_argument("-ignore", type=str, default="[.]cu$", + help="Regex used to ignore files from checking") + argparser.add_argument("-select", type=str, default=None, + help="Regex used to select files for checking") + argparser.add_argument("-j", type=int, default=-1, + help="Number of parallel jobs to launch.") + args = argparser.parse_args() + if args.j <= 0: + args.j = mp.cpu_count() + args.ignore_compiled = re.compile(args.ignore) if args.ignore else None + args.select_compiled = re.compile(args.select) if args.select else None + ret = subprocess.check_output("%s --version" % args.exe, shell=True) + ret = ret.decode("utf-8") + version = VERSION_REGEX.search(ret) + if version is None: + raise Exception("Failed to figure out clang-tidy version!") + version = version.group(1) + if version != EXPECTED_VERSION: + raise Exception("clang-tidy exe must be v%s found '%s'" % \ + (EXPECTED_VERSION, version)) + if not os.path.exists(args.cdb): + raise Exception("Compilation database '%s' missing" % args.cdb) + return args + + +def list_all_cmds(cdb): + with open(cdb, "r") as fp: + return json.load(fp) + + +def get_gpu_archs(command): + archs = [] + for loc in range(len(command)): + if command[loc] != "-gencode": + continue + arch_flag = command[loc + 1] + match = GPU_ARCH_REGEX.search(arch_flag) + if match is not None: + archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) + return archs + + +def get_index(arr, item): + try: + return arr.index(item) + except: + return -1 + + +def remove_item(arr, item): + loc = get_index(arr, item) + if loc >= 0: + del arr[loc] + return loc + + +def remove_item_plus_one(arr, item): + loc = get_index(arr, item) + if loc >= 0: + del arr[loc + 1] + del arr[loc] + return loc + + +def get_clang_includes(exe): + dir = os.getenv("CONDA_PREFIX") + if dir is None: + ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) + ret = ret.decode("utf-8") + dir = os.path.dirname(os.path.dirname(ret)) + header = os.path.join(dir, "include", "ClangHeaders") + return ["-I", header] + + +def get_tidy_args(cmd, exe): + command, file = cmd["command"], cmd["file"] + is_cuda = file.endswith(".cu") + command = re.split(SPACES, command) + # compiler is always clang++! + command[0] = "clang++" + # remove compilation and output targets from the original command + remove_item_plus_one(command, "-c") + remove_item_plus_one(command, "-o") + if is_cuda: + # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." + archs = get_gpu_archs(command) + command.extend(archs) + while True: + loc = remove_item_plus_one(command, "-gencode") + if loc < 0: + break + # "-x cuda" is the right usage in clang + loc = get_index(command, "-x") + if loc >= 0: + command[loc + 1] = "cuda" + remove_item_plus_one(command, "-ccbin") + remove_item(command, "--expt-extended-lambda") + remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") + command.extend(get_clang_includes(exe)) + return command, is_cuda + + +def run_clang_tidy_command(tidy_cmd): + cmd = " ".join(tidy_cmd) + result = subprocess.run(cmd, check=False, shell=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + status = result.returncode == 0 + if status: + out = "" + else: + out = "CMD: " + cmd + out += result.stdout.decode("utf-8").rstrip() + return status, out + + +def run_clang_tidy(cmd, args): + command, is_cuda = get_tidy_args(cmd, args.exe) + tidy_cmd = [args.exe, "-header-filter=.*raft/cpp/.*", cmd["file"], "--", ] + tidy_cmd.extend(command) + status = True + out = "" + if is_cuda: + tidy_cmd.append("--cuda-device-only") + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + out += out1 + out += "%s" % SEPARATOR + if not ret: + status = ret + tidy_cmd[-2] = "--cuda-host-only" + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + else: + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + return status, out, cmd["file"] + + +# yikes! global var :( +results = [] +def collect_result(result): + global results + results.append(result) + + +def print_result(passed, stdout, file): + status_str = "PASSED" if passed else "FAILED" + print("%s File:%s %s %s" % (SEPARATOR, file, status_str, SEPARATOR)) + if stdout: + print(stdout) + print("%s File:%s ENDS %s" % (SEPARATOR, file, SEPARATOR)) + + +def print_results(): + global results + status = True + for passed, stdout, file in results: + print_result(passed, stdout, file) + if not passed: + status = False + return status + + +# mostly used for debugging purposes +def run_sequential(args, all_files): + status = True + # actual tidy checker + for cmd in all_files: + # skip files that we don't want to look at + if args.ignore_compiled is not None and \ + re.search(args.ignore_compiled, cmd["file"]) is not None: + continue + if args.select_compiled is not None and \ + re.search(args.select_compiled, cmd["file"]) is None: + continue + passed, stdout, file = run_clang_tidy(cmd, args) + print_result(passed, stdout, file) + if not passed: + status = False + return status + + +def run_parallel(args, all_files): + pool = mp.Pool(args.j) + # actual tidy checker + for cmd in all_files: + # skip files that we don't want to look at + if args.ignore_compiled is not None and \ + re.search(args.ignore_compiled, cmd["file"]) is not None: + continue + if args.select_compiled is not None and \ + re.search(args.select_compiled, cmd["file"]) is None: + continue + pool.apply_async(run_clang_tidy, args=(cmd, args), + callback=collect_result) + pool.close() + pool.join() + return print_results() + + +def main(): + args = parse_args() + # Attempt to making sure that we run this script from root of repo always + if not os.path.exists(".git"): + raise Exception("This needs to always be run from the root of repo") + all_files = list_all_cmds(args.cdb) + if args.j == 1: + status = run_sequential(args, all_files) + else: + status = run_parallel(args, all_files) + if not status: + raise Exception("clang-tidy failed! Refer to the errors above.") + + +if __name__ == "__main__": + main() diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp new file mode 100644 index 0000000000..c14d880efd --- /dev/null +++ b/cpp/test/cudart_utils.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft { + +TEST(Raft, Utils) { + ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); + ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); + ASSERT_THROW(THROW("Should throw!"), exception); + ASSERT_NO_THROW(CUDA_CHECK(cudaFree(nullptr))); +} + +} // namespace raft diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp new file mode 100644 index 0000000000..2c5280199d --- /dev/null +++ b/cpp/test/handle.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace raft { + +TEST(Raft, HandleDefault) { + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + ASSERT_EQ(nullptr, h.get_stream()); + ASSERT_NE(nullptr, h.get_cublas_handle()); + ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); + ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); + ASSERT_NE(nullptr, h.get_cusparse_handle()); +} + +TEST(Raft, Handle) { + handle_t h(4); + ASSERT_EQ(4, h.get_num_internal_streams()); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + h.set_stream(stream); + ASSERT_EQ(stream, h.get_stream()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +} // namespace raft diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp new file mode 100644 index 0000000000..86aee43ce3 --- /dev/null +++ b/cpp/test/mr/device/buffer.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace raft { +namespace mr { +namespace device { + +TEST(Raft, DeviceBuffer) { + auto alloc = std::make_shared(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + // no allocation at construction + buffer buff(alloc, stream); + ASSERT_EQ(0, buff.size()); + // explicit allocation after construction + buff.resize(20, stream); + ASSERT_EQ(20, buff.size()); + // resizing to a smaller buffer size + buff.resize(10, stream); + ASSERT_EQ(10, buff.size()); + // explicit deallocation + buff.release(stream); + ASSERT_EQ(0, buff.size()); + // use these methods without the explicit stream parameter + buff.resize(20); + ASSERT_EQ(20, buff.size()); + buff.resize(10); + ASSERT_EQ(10, buff.size()); + buff.release(); + ASSERT_EQ(0, buff.size()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +} // namespace device +} // namespace mr +} // namespace raft diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp new file mode 100644 index 0000000000..953f65ddfb --- /dev/null +++ b/cpp/test/mr/host/buffer.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace raft { +namespace mr { +namespace host { + +TEST(Raft, HostBuffer) { + auto alloc = std::make_shared(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + // no allocation at construction + buffer buff(alloc, stream); + ASSERT_EQ(0, buff.size()); + // explicit allocation after construction + buff.resize(20, stream); + ASSERT_EQ(20, buff.size()); + // resizing to a smaller buffer size + buff.resize(10, stream); + ASSERT_EQ(10, buff.size()); + // explicit deallocation + buff.release(stream); + ASSERT_EQ(0, buff.size()); + // use these methods without the explicit stream parameter + buff.resize(20); + ASSERT_EQ(20, buff.size()); + buff.resize(10); + ASSERT_EQ(10, buff.size()); + buff.release(); + ASSERT_EQ(0, buff.size()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +TEST(Raft, DeviceToHostBuffer) { + auto d_alloc = std::make_shared(); + auto h_alloc = std::make_shared(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + device::buffer d_buff(d_alloc, stream, 32); + CUDA_CHECK( + cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + buffer h_buff(h_alloc, d_buff); + ASSERT_EQ(d_buff.size(), h_buff.size()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +} // namespace host +} // namespace mr +} // namespace raft diff --git a/cpp/test/test.cpp b/cpp/test/test.cpp index 1ca69febca..7477d7d0b5 100644 --- a/cpp/test/test.cpp +++ b/cpp/test/test.cpp @@ -14,10 +14,12 @@ * limitations under the License. */ +#include #include #include -int main() { - std::string result = raft::test_raft(); - std::cout << result; -} +namespace raft { + +TEST(Raft, print) { std::cout << test_raft() << std::endl; } + +} // namespace raft