Skip to content

Commit

Permalink
Replace spdlog by cudf::logger
Browse files Browse the repository at this point in the history
Signed-off-by: Nghia Truong <[email protected]>
  • Loading branch information
ttnghia committed Dec 11, 2024
1 parent bd8b7e6 commit 9d993f1
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 83 deletions.
1 change: 1 addition & 0 deletions src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ target_link_libraries(
-Wl,--whole-archive
${CUDFJNI_LIB}
cudf::cudf
cudf::cudf_logger
nvtx3::nvtx3-cpp
-Wl,--no-whole-archive
${ARROW_LIB}
Expand Down
8 changes: 2 additions & 6 deletions src/main/cpp/faultinj/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#=============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,9 +28,5 @@ add_library(
)

target_link_libraries(
cufaultinj PRIVATE spdlog::spdlog_header_only
)

target_link_libraries(
cufaultinj PRIVATE CUDA::cupti_static
cufaultinj PRIVATE CUDA::cupti_static cudf::cudf_logger
)
120 changes: 60 additions & 60 deletions src/main/cpp/faultinj/faultinj.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include <cudf/logger.hpp>

#include <cuda.h>

#include <assert.h>
Expand All @@ -28,29 +30,28 @@
#define BOOST_SPIRIT_THREADSAFE
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/ptree.hpp>
#include <spdlog/spdlog.h>
#include <sys/inotify.h>
#include <sys/time.h>

// Format enums for logging
auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); }
// auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); }

namespace {

#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char* errstr; \
cuptiGetResultString(_status, &errstr); \
spdlog::error("function {} failed with error {}", #call, errstr); \
} \
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char* errstr; \
cuptiGetResultString(_status, &errstr); \
CUDF_LOG_ERROR("function {} failed with error {}", #call, errstr); \
} \
} while (0)

#define PTHREAD_CALL(call) \
do { \
int _status = call; \
if (_status != 0) { spdlog::error("function {} failed with error code {}", #call, _status); } \
#define PTHREAD_CALL(call) \
do { \
int _status = call; \
if (_status != 0) { CUDF_LOG_ERROR("function {} failed with error code {}", #call, _status); } \
} while (0)

typedef enum { FI_TRAP, FI_ASSERT, FI_RETURN_VALUE } FaultInjectionType;
Expand Down Expand Up @@ -89,16 +90,16 @@ void* dynamicReconfig(void* args);

void globalControlInit(void)
{
spdlog::debug("globalControlInit of fault injection");
CUDF_LOG_DEBUG("globalControlInit of fault injection");
globalControl.initialized = 0;
globalControl.subscriber = 0;
globalControl.terminateThread = 0;
spdlog::trace("checking environment {}", configFilePathEnv);
CUDF_LOG_TRACE("checking environment {}", configFilePathEnv);
const char* configFilePath = std::getenv(configFilePathEnv.c_str());
spdlog::debug("{} is {}", configFilePathEnv, configFilePath);
CUDF_LOG_DEBUG("{} is {}", configFilePathEnv, configFilePath);
if (configFilePath) {
globalControl.configFilePath = std::string(configFilePath);
spdlog::debug("will init config from {}", globalControl.configFilePath);
CUDF_LOG_DEBUG("will init config from {}", globalControl.configFilePath);
}
readFaultInjectorConfig();
globalControl.initialized = 1;
Expand All @@ -115,10 +116,10 @@ void atExitHandler(void)
if (globalControl.dynamic) {
globalControl.terminateThread = 1;
PTHREAD_CALL(pthread_join(globalControl.dynamicThread, nullptr));
spdlog::info("reconfig thread shut down ... exiting");
CUDF_LOG_INFO("reconfig thread shut down ... exiting");
}

spdlog::debug("atExitHandler: cuptiFinalize");
CUDF_LOG_DEBUG("atExitHandler: cuptiFinalize");
CUPTI_CALL(cuptiFinalize());
}

Expand Down Expand Up @@ -200,9 +201,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
// case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz:
if (std::string(cbInfo->symbolName)
.compare(0, faultInjectorKernelPrefix.size(), faultInjectorKernelPrefix) == 0) {
spdlog::debug("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
CUDF_LOG_DEBUG("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
break;
}
// intentional fallthrough
Expand All @@ -225,9 +226,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
case CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000:
if (std::string(cbInfo->symbolName)
.compare(0, faultInjectorKernelPrefix.size(), faultInjectorKernelPrefix) == 0) {
spdlog::debug("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
CUDF_LOG_DEBUG("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
break;
}
// intentional fallthrough
Expand Down Expand Up @@ -261,7 +262,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
const int interceptionCount =
(*matchedFaultConfig).get_optional<int>(interceptionCountKey).value_or(INT_MAX);

spdlog::trace(
CUDF_LOG_TRACE(
"considered config domain={} function={} injectionType={} probability={} "
"interceptionCount={}",
domain,
Expand All @@ -271,7 +272,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
interceptionCount);

if (interceptionCount <= 0) {
spdlog::trace(
CUDF_LOG_TRACE(
"skipping interception because hit count reached 0, "
"domain={} function={} injectionType={} probability={} "
"interceptionCount={}",
Expand All @@ -287,9 +288,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
if (injectionProbability <= 0) { return; }
const int rand10000 = std::rand() % 10000;
const int skipThreshold = injectionProbability * 10000 / 100;
spdlog::trace("rand1000={} skipThreshold={}", rand10000, skipThreshold);
CUDF_LOG_TRACE("rand1000={} skipThreshold={}", rand10000, skipThreshold);
if (rand10000 >= skipThreshold) { return; }
spdlog::debug(
CUDF_LOG_DEBUG(
"matched config based on rand10000={} skipThreshold={} "
"domain={} function={} injectionType={} probability={}",
rand10000,
Expand All @@ -299,7 +300,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
injectionType,
injectionProbability);
} else {
spdlog::debug(
CUDF_LOG_DEBUG(
"matched 100% config domain={} function={} injectionType={} "
"probability={}",
domain,
Expand All @@ -310,7 +311,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,

// update counter if not unlimited
if (interceptionCount != INT_MAX) {
spdlog::debug("updating interception count {}: before locking", interceptionCount);
CUDF_LOG_DEBUG("updating interception count {}: before locking", interceptionCount);
// TODO the lock is too coarse-grained.
PTHREAD_CALL(pthread_rwlock_wrlock(&globalControl.configLock));
const int interceptionCount = (*matchedFaultConfig).get<int>("interceptionCount");
Expand All @@ -335,7 +336,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
*cuResPtr = static_cast<CUresult>(substituteReturnCode);
} else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
cudaError_t* cudaErrPtr = static_cast<cudaError_t*>(cbInfo->functionReturnValue);
spdlog::error("updating runtime return value DOES NOT WORK, use trap or assert");
CUDF_LOG_ERROR("updating runtime return value DOES NOT WORK, use trap or assert");
*cudaErrPtr = static_cast<cudaError_t>(substituteReturnCode);
break;
}
Expand All @@ -350,17 +351,17 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
void readFaultInjectorConfig(void)
{
if (globalControl.configFilePath.empty()) {
spdlog::error("specify convig via environment {}", configFilePathEnv);
CUDF_LOG_ERROR("specify convig via environment {}", configFilePathEnv);
return;
}
std::ifstream jsonStream(globalControl.configFilePath);
if (!jsonStream.good()) {
spdlog::error("check file exists {}", globalControl.configFilePath);
CUDF_LOG_ERROR("check file exists {}", globalControl.configFilePath);
return;
}

// to retrieve and the numeric value of spdlog:level::level_enum
// https://github.com/gabime/spdlog/blob/d546201f127c306ec8a0082d57562a05a049af77/include/spdlog/common.h#L198-L204
// The numeric value of level_enum is retrieved from
// https://github.com/rapidsai/rapids-logger/blob/main/logger.hpp.in#L40
const std::string logLevelKey = "logLevel";

// A Boolean flag as to whether to watch for config file modifications
Expand Down Expand Up @@ -392,29 +393,28 @@ void readFaultInjectorConfig(void)

const unsigned seed =
globalControl.configRoot.get_optional<unsigned>(seedKey).value_or(std::time(0));
spdlog::info("Seeding std::srand with {}", seed);
CUDF_LOG_INFO("Seeding std::srand with {}", seed);
std::srand(seed);

const spdlog::level::level_enum logLevelEnum = static_cast<spdlog::level::level_enum>(logLevel);
spdlog::info("changed log level to {}", logLevel);
spdlog::set_level(logLevelEnum);
CUDF_LOG_INFO("changed log level to {}", logLevel);
cudf::default_logger().set_level(static_cast<cudf::level_enum>(logLevel));
traceConfig(globalControl.configRoot);

globalControl.driverFaultConfigs = globalControl.configRoot.get_child_optional(driverFaultsKey);
globalControl.runtimeFaultConfigs =
globalControl.configRoot.get_child_optional(runtimeFaultsKey);
} catch (boost::property_tree::json_parser::json_parser_error& error) {
spdlog::error("error parsing fault injector config, still editing? {}", error.what());
CUDF_LOG_ERROR("error parsing fault injector config, still editing? {}", error.what());
}
PTHREAD_CALL(pthread_rwlock_unlock(&globalControl.configLock));
jsonStream.close();
spdlog::debug("readFaultInjectorConfig from {} DONE", globalControl.configFilePath);
CUDF_LOG_DEBUG("readFaultInjectorConfig from {} DONE", globalControl.configFilePath);
}

void traceConfig(boost::property_tree::ptree const& pTree)
{
for (auto it = pTree.begin(); it != pTree.end(); ++it) {
spdlog::trace("congig key={} value={}", it->first, it->second.get_value<std::string>());
CUDF_LOG_TRACE("congig key={} value={}", it->first, it->second.get_value<std::string>());
traceConfig(it->second);
}
}
Expand All @@ -432,17 +432,17 @@ int eventCheck(int fd)

void* dynamicReconfig(void*)
{
spdlog::debug("config watcher thread: inotify_init()");
CUDF_LOG_DEBUG("config watcher thread: inotify_init()");
const int inotifyFd = inotify_init();
if (inotifyFd < 0) {
spdlog::error("inotify_init() failed");
CUDF_LOG_ERROR("inotify_init() failed");
return nullptr;
}
spdlog::debug("config watcher thread: inotify_add_watch {}", globalControl.configFilePath);
CUDF_LOG_DEBUG("config watcher thread: inotify_add_watch {}", globalControl.configFilePath);
const int watchFd = inotify_add_watch(inotifyFd, globalControl.configFilePath.c_str(), IN_MODIFY);
if (watchFd < 0) {
spdlog::error("config watcher thread: inotify_add_watch {} failed",
globalControl.configFilePath);
CUDF_LOG_ERROR("config watcher thread: inotify_add_watch {} failed",
globalControl.configFilePath);
return nullptr;
}

Expand All @@ -454,32 +454,32 @@ void* dynamicReconfig(void*)
char eventBuffer[BUF_LEN];

while (!globalControl.terminateThread) {
spdlog::trace("about to call eventCheck");
CUDF_LOG_TRACE("about to call eventCheck");
const int eventCheckRes = eventCheck(inotifyFd);
spdlog::trace("eventCheck returned {}", eventCheckRes);
CUDF_LOG_TRACE("eventCheck returned {}", eventCheckRes);
if (eventCheckRes > 0) {
const int length = read(inotifyFd, eventBuffer, BUF_LEN);
spdlog::debug("config watcher thread: read {} bytes", length);
CUDF_LOG_DEBUG("config watcher thread: read {} bytes", length);
if (length < EVENT_SIZE) { continue; }
for (int i = 0; i < length;) {
struct inotify_event* event = (struct inotify_event*)&eventBuffer[i];
spdlog::debug("modfiled file detected: {}", event->name);
CUDF_LOG_DEBUG("modfiled file detected: {}", event->name);
i += EVENT_SIZE + event->len;
}
readFaultInjectorConfig();
}
}

if (watchFd >= 0) {
spdlog::debug("config watcher thread: inotify_rm_watch {} {}", inotifyFd, watchFd);
CUDF_LOG_DEBUG("config watcher thread: inotify_rm_watch {} {}", inotifyFd, watchFd);
inotify_rm_watch(inotifyFd, watchFd);
}
if (inotifyFd >= 0) {
spdlog::debug("config watcher thread: close {}", inotifyFd);
CUDF_LOG_DEBUG("config watcher thread: close {}", inotifyFd);
close(inotifyFd);
}
spdlog::info("exiting dynamic reconfig thread: terminateThread={}",
globalControl.terminateThread);
CUDF_LOG_INFO("exiting dynamic reconfig thread: terminateThread={}",
globalControl.terminateThread);
return nullptr;
}

Expand All @@ -490,9 +490,9 @@ void* dynamicReconfig(void*)
*/
extern "C" int InitializeInjection(void)
{
spdlog::info("cuInit entry point for libcufaultinj InitializeInjection");
CUDF_LOG_INFO("cuInit entry point for libcufaultinj InitializeInjection");
// intial log level is trace until the config is read
spdlog::set_level(spdlog::level::trace);
cudf::default_logger().set_level(cudf::level_enum::trace);

if (globalControl.initialized) { return 1; }
// Init globalControl
Expand All @@ -501,7 +501,7 @@ extern "C" int InitializeInjection(void)
registerAtExitHandler();

if (globalControl.dynamic) {
spdlog::debug("creating a thread to watch the fault injector config interactively");
CUDF_LOG_DEBUG("creating a thread to watch the fault injector config interactively");
PTHREAD_CALL(pthread_create(&globalControl.dynamicThread, nullptr, dynamicReconfig, nullptr));
}

Expand Down
Loading

0 comments on commit 9d993f1

Please sign in to comment.