diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index b5543ca10..78f43f13b 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -277,6 +277,7 @@ target_link_libraries( -Wl,--whole-archive ${CUDFJNI_LIB} cudf::cudf + cudf::cudf_logger nvtx3::nvtx3-cpp -Wl,--no-whole-archive ${ARROW_LIB} diff --git a/src/main/cpp/faultinj/CMakeLists.txt b/src/main/cpp/faultinj/CMakeLists.txt index ce4a03f3c..78844728a 100644 --- a/src/main/cpp/faultinj/CMakeLists.txt +++ b/src/main/cpp/faultinj/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,9 +28,5 @@ add_library( ) target_link_libraries( - cufaultinj PRIVATE spdlog::spdlog_header_only -) - -target_link_libraries( - cufaultinj PRIVATE CUDA::cupti_static + cufaultinj PRIVATE CUDA::cupti_static cudf::cudf_logger ) diff --git a/src/main/cpp/faultinj/faultinj.cu b/src/main/cpp/faultinj/faultinj.cu index fcb4b3a12..cf8dd6b29 100644 --- a/src/main/cpp/faultinj/faultinj.cu +++ b/src/main/cpp/faultinj/faultinj.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include @@ -28,29 +30,28 @@ #define BOOST_SPIRIT_THREADSAFE #include #include -#include #include #include // Format enums for logging -auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); } +// auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); } namespace { -#define CUPTI_CALL(call) \ - do { \ - CUptiResult _status = call; \ - if (_status != CUPTI_SUCCESS) { \ - const char* errstr; \ - cuptiGetResultString(_status, &errstr); \ - spdlog::error("function {} failed with error {}", #call, errstr); \ - } \ +#define CUPTI_CALL(call) \ + do { \ + CUptiResult _status = call; \ + if (_status != CUPTI_SUCCESS) { \ + const char* errstr; \ + cuptiGetResultString(_status, &errstr); \ + CUDF_LOG_ERROR("function {} failed with error {}", #call, errstr); \ + } \ } while (0) -#define PTHREAD_CALL(call) \ - do { \ - int _status = call; \ - if (_status != 0) { spdlog::error("function {} failed with error code {}", #call, _status); } \ +#define PTHREAD_CALL(call) \ + do { \ + int _status = call; \ + if (_status != 0) { CUDF_LOG_ERROR("function {} failed with error code {}", #call, _status); } \ } while (0) typedef enum { FI_TRAP, FI_ASSERT, FI_RETURN_VALUE } FaultInjectionType; @@ -89,16 +90,16 @@ void* dynamicReconfig(void* args); void globalControlInit(void) { - spdlog::debug("globalControlInit of fault injection"); + CUDF_LOG_DEBUG("globalControlInit of fault injection"); globalControl.initialized = 0; globalControl.subscriber = 0; globalControl.terminateThread = 0; - spdlog::trace("checking environment {}", configFilePathEnv); + CUDF_LOG_TRACE("checking environment {}", configFilePathEnv); const char* configFilePath = std::getenv(configFilePathEnv.c_str()); - spdlog::debug("{} is {}", configFilePathEnv, configFilePath); + CUDF_LOG_DEBUG("{} is {}", configFilePathEnv, configFilePath); if (configFilePath) { globalControl.configFilePath = std::string(configFilePath); - spdlog::debug("will init config from {}", globalControl.configFilePath); + CUDF_LOG_DEBUG("will init config from {}", globalControl.configFilePath); } readFaultInjectorConfig(); globalControl.initialized = 1; @@ -115,10 +116,10 @@ void atExitHandler(void) if (globalControl.dynamic) { globalControl.terminateThread = 1; PTHREAD_CALL(pthread_join(globalControl.dynamicThread, nullptr)); - spdlog::info("reconfig thread shut down ... exiting"); + CUDF_LOG_INFO("reconfig thread shut down ... exiting"); } - spdlog::debug("atExitHandler: cuptiFinalize"); + CUDF_LOG_DEBUG("atExitHandler: cuptiFinalize"); CUPTI_CALL(cuptiFinalize()); } @@ -200,9 +201,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, // case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz: if (std::string(cbInfo->symbolName) .compare(0, faultInjectorKernelPrefix.size(), faultInjectorKernelPrefix) == 0) { - spdlog::debug("rejecting fake launch functionName={} symbol={}", - cbInfo->functionName, - cbInfo->symbolName); + CUDF_LOG_DEBUG("rejecting fake launch functionName={} symbol={}", + cbInfo->functionName, + cbInfo->symbolName); break; } // intentional fallthrough @@ -225,9 +226,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, case CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000: if (std::string(cbInfo->symbolName) .compare(0, faultInjectorKernelPrefix.size(), faultInjectorKernelPrefix) == 0) { - spdlog::debug("rejecting fake launch functionName={} symbol={}", - cbInfo->functionName, - cbInfo->symbolName); + CUDF_LOG_DEBUG("rejecting fake launch functionName={} symbol={}", + cbInfo->functionName, + cbInfo->symbolName); break; } // intentional fallthrough @@ -261,7 +262,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, const int interceptionCount = (*matchedFaultConfig).get_optional(interceptionCountKey).value_or(INT_MAX); - spdlog::trace( + CUDF_LOG_TRACE( "considered config domain={} function={} injectionType={} probability={} " "interceptionCount={}", domain, @@ -271,7 +272,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, interceptionCount); if (interceptionCount <= 0) { - spdlog::trace( + CUDF_LOG_TRACE( "skipping interception because hit count reached 0, " "domain={} function={} injectionType={} probability={} " "interceptionCount={}", @@ -287,9 +288,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, if (injectionProbability <= 0) { return; } const int rand10000 = std::rand() % 10000; const int skipThreshold = injectionProbability * 10000 / 100; - spdlog::trace("rand1000={} skipThreshold={}", rand10000, skipThreshold); + CUDF_LOG_TRACE("rand1000={} skipThreshold={}", rand10000, skipThreshold); if (rand10000 >= skipThreshold) { return; } - spdlog::debug( + CUDF_LOG_DEBUG( "matched config based on rand10000={} skipThreshold={} " "domain={} function={} injectionType={} probability={}", rand10000, @@ -299,7 +300,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, injectionType, injectionProbability); } else { - spdlog::debug( + CUDF_LOG_DEBUG( "matched 100% config domain={} function={} injectionType={} " "probability={}", domain, @@ -310,7 +311,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, // update counter if not unlimited if (interceptionCount != INT_MAX) { - spdlog::debug("updating interception count {}: before locking", interceptionCount); + CUDF_LOG_DEBUG("updating interception count {}: before locking", interceptionCount); // TODO the lock is too coarse-grained. PTHREAD_CALL(pthread_rwlock_wrlock(&globalControl.configLock)); const int interceptionCount = (*matchedFaultConfig).get("interceptionCount"); @@ -335,7 +336,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, *cuResPtr = static_cast(substituteReturnCode); } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) { cudaError_t* cudaErrPtr = static_cast(cbInfo->functionReturnValue); - spdlog::error("updating runtime return value DOES NOT WORK, use trap or assert"); + CUDF_LOG_ERROR("updating runtime return value DOES NOT WORK, use trap or assert"); *cudaErrPtr = static_cast(substituteReturnCode); break; } @@ -350,17 +351,17 @@ void CUPTIAPI faultInjectionCallbackHandler(void*, void readFaultInjectorConfig(void) { if (globalControl.configFilePath.empty()) { - spdlog::error("specify convig via environment {}", configFilePathEnv); + CUDF_LOG_ERROR("specify convig via environment {}", configFilePathEnv); return; } std::ifstream jsonStream(globalControl.configFilePath); if (!jsonStream.good()) { - spdlog::error("check file exists {}", globalControl.configFilePath); + CUDF_LOG_ERROR("check file exists {}", globalControl.configFilePath); return; } - // to retrieve and the numeric value of spdlog:level::level_enum - // https://github.com/gabime/spdlog/blob/d546201f127c306ec8a0082d57562a05a049af77/include/spdlog/common.h#L198-L204 + // The numeric value of level_enum is retrieved from + // https://github.com/rapidsai/rapids-logger/blob/main/logger.hpp.in#L40 const std::string logLevelKey = "logLevel"; // A Boolean flag as to whether to watch for config file modifications @@ -392,29 +393,28 @@ void readFaultInjectorConfig(void) const unsigned seed = globalControl.configRoot.get_optional(seedKey).value_or(std::time(0)); - spdlog::info("Seeding std::srand with {}", seed); + CUDF_LOG_INFO("Seeding std::srand with {}", seed); std::srand(seed); - const spdlog::level::level_enum logLevelEnum = static_cast(logLevel); - spdlog::info("changed log level to {}", logLevel); - spdlog::set_level(logLevelEnum); + CUDF_LOG_INFO("changed log level to {}", logLevel); + cudf::default_logger().set_level(static_cast(logLevel)); traceConfig(globalControl.configRoot); globalControl.driverFaultConfigs = globalControl.configRoot.get_child_optional(driverFaultsKey); globalControl.runtimeFaultConfigs = globalControl.configRoot.get_child_optional(runtimeFaultsKey); } catch (boost::property_tree::json_parser::json_parser_error& error) { - spdlog::error("error parsing fault injector config, still editing? {}", error.what()); + CUDF_LOG_ERROR("error parsing fault injector config, still editing? {}", error.what()); } PTHREAD_CALL(pthread_rwlock_unlock(&globalControl.configLock)); jsonStream.close(); - spdlog::debug("readFaultInjectorConfig from {} DONE", globalControl.configFilePath); + CUDF_LOG_DEBUG("readFaultInjectorConfig from {} DONE", globalControl.configFilePath); } void traceConfig(boost::property_tree::ptree const& pTree) { for (auto it = pTree.begin(); it != pTree.end(); ++it) { - spdlog::trace("congig key={} value={}", it->first, it->second.get_value()); + CUDF_LOG_TRACE("congig key={} value={}", it->first, it->second.get_value()); traceConfig(it->second); } } @@ -432,17 +432,17 @@ int eventCheck(int fd) void* dynamicReconfig(void*) { - spdlog::debug("config watcher thread: inotify_init()"); + CUDF_LOG_DEBUG("config watcher thread: inotify_init()"); const int inotifyFd = inotify_init(); if (inotifyFd < 0) { - spdlog::error("inotify_init() failed"); + CUDF_LOG_ERROR("inotify_init() failed"); return nullptr; } - spdlog::debug("config watcher thread: inotify_add_watch {}", globalControl.configFilePath); + CUDF_LOG_DEBUG("config watcher thread: inotify_add_watch {}", globalControl.configFilePath); const int watchFd = inotify_add_watch(inotifyFd, globalControl.configFilePath.c_str(), IN_MODIFY); if (watchFd < 0) { - spdlog::error("config watcher thread: inotify_add_watch {} failed", - globalControl.configFilePath); + CUDF_LOG_ERROR("config watcher thread: inotify_add_watch {} failed", + globalControl.configFilePath); return nullptr; } @@ -454,16 +454,16 @@ void* dynamicReconfig(void*) char eventBuffer[BUF_LEN]; while (!globalControl.terminateThread) { - spdlog::trace("about to call eventCheck"); + CUDF_LOG_TRACE("about to call eventCheck"); const int eventCheckRes = eventCheck(inotifyFd); - spdlog::trace("eventCheck returned {}", eventCheckRes); + CUDF_LOG_TRACE("eventCheck returned {}", eventCheckRes); if (eventCheckRes > 0) { const int length = read(inotifyFd, eventBuffer, BUF_LEN); - spdlog::debug("config watcher thread: read {} bytes", length); + CUDF_LOG_DEBUG("config watcher thread: read {} bytes", length); if (length < EVENT_SIZE) { continue; } for (int i = 0; i < length;) { struct inotify_event* event = (struct inotify_event*)&eventBuffer[i]; - spdlog::debug("modfiled file detected: {}", event->name); + CUDF_LOG_DEBUG("modfiled file detected: {}", event->name); i += EVENT_SIZE + event->len; } readFaultInjectorConfig(); @@ -471,15 +471,15 @@ void* dynamicReconfig(void*) } if (watchFd >= 0) { - spdlog::debug("config watcher thread: inotify_rm_watch {} {}", inotifyFd, watchFd); + CUDF_LOG_DEBUG("config watcher thread: inotify_rm_watch {} {}", inotifyFd, watchFd); inotify_rm_watch(inotifyFd, watchFd); } if (inotifyFd >= 0) { - spdlog::debug("config watcher thread: close {}", inotifyFd); + CUDF_LOG_DEBUG("config watcher thread: close {}", inotifyFd); close(inotifyFd); } - spdlog::info("exiting dynamic reconfig thread: terminateThread={}", - globalControl.terminateThread); + CUDF_LOG_INFO("exiting dynamic reconfig thread: terminateThread={}", + globalControl.terminateThread); return nullptr; } @@ -490,9 +490,9 @@ void* dynamicReconfig(void*) */ extern "C" int InitializeInjection(void) { - spdlog::info("cuInit entry point for libcufaultinj InitializeInjection"); + CUDF_LOG_INFO("cuInit entry point for libcufaultinj InitializeInjection"); // intial log level is trace until the config is read - spdlog::set_level(spdlog::level::trace); + cudf::default_logger().set_level(cudf::level_enum::trace); if (globalControl.initialized) { return 1; } // Init globalControl @@ -501,7 +501,7 @@ extern "C" int InitializeInjection(void) registerAtExitHandler(); if (globalControl.dynamic) { - spdlog::debug("creating a thread to watch the fault injector config interactively"); + CUDF_LOG_DEBUG("creating a thread to watch the fault injector config interactively"); PTHREAD_CALL(pthread_create(&globalControl.dynamicThread, nullptr, dynamicReconfig, nullptr)); } diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp index 31a603411..e402f8101 100644 --- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp +++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp @@ -14,18 +14,16 @@ * limitations under the License. */ +#include + #include #include #include -#include -#include -#include -#include -#include #include #include +#include #include #include #include @@ -114,23 +112,24 @@ const char* as_str(thread_state state) } } -static std::shared_ptr make_logger(std::ostream& stream) +static std::shared_ptr make_logger(std::ostream& stream) { - return std::make_shared("SPARK_RMM", - std::make_shared(stream)); + return std::make_shared( + "SPARK_RMM", std::vector{std::make_shared(stream)}); } -static std::shared_ptr make_logger() +static std::shared_ptr make_logger() { - return std::make_shared("SPARK_RMM", - std::make_shared()); + return std::make_shared( + "SPARK_RMM", std::vector{std::make_shared()}); } static auto make_logger(std::string const& filename) { - return std::make_shared( + return std::make_shared( "SPARK_RMM", - std::make_shared(filename, true /*truncate file*/)); + std::vector{ + std::make_shared(filename, true /*truncate file*/)}); } /** @@ -394,12 +393,12 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { public: spark_resource_adaptor(JNIEnv* env, rmm::mr::device_memory_resource* mr, - std::shared_ptr& logger, + std::shared_ptr& logger, bool const is_log_enabled) : resource{mr}, logger{logger}, is_log_enabled{is_log_enabled} { if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } - logger->flush_on(spdlog::level::info); + logger->flush_on(cudf::level_enum::info); logger->set_pattern("%v"); logger->info("time,op,current thread,op thread,op task,from state,to state,notes"); logger->set_pattern("%H:%M:%S.%f,%v"); @@ -880,7 +879,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { private: rmm::mr::device_memory_resource* const resource; - std::shared_ptr logger; ///< spdlog logger object + std::shared_ptr logger; ///< logger object bool const is_log_enabled; // The state mutex must be held when modifying the state of threads or tasks @@ -1844,7 +1843,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cr cudf::jni::auto_set_device(env); auto wrapped = reinterpret_cast(child); cudf::jni::native_jstring nlogloc(env, log_loc); - std::shared_ptr logger; + std::shared_ptr logger; bool is_log_enabled; if (nlogloc.is_null()) { logger = make_logger();