From 47a0d339edb624124225a04b9f35981887c7a8bc Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Tue, 5 Nov 2024 09:37:51 -0600 Subject: [PATCH] DAOS-16365 client: intercept dlsym() and zeInit() to avoid nested call (#14932) libfabric loads libze_loader.so which calls zeInit(). We observed deadlock due to nested calls when daos_init() is called inside zeInit(). We intercept dlsym() and zeInit() to avoid calling daos_init() inside zeInit(). dlsym(RTLD_NEXT, ) checks returning address to determine caller's module. To maintain expected behavior of dlsym(RTLD_NEXT, ) with our interception, new_dlsym() is implemented with assembly code to use jmp instruction instead of call. dlsym() has been moved from libdl.so to libc.so since version 2.34. Signed-off-by: Lei Huang --- src/client/dfuse/pil4dfs/hook.c | 49 +++++-- src/client/dfuse/pil4dfs/hook.h | 6 + src/client/dfuse/pil4dfs/int_dfs.c | 189 ++++++++++++++++++++++++- src/client/dfuse/pil4dfs/pil4dfs_int.h | 2 +- 4 files changed, 226 insertions(+), 20 deletions(-) diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c index 0ec1a0b5374..c30061387c1 100644 --- a/src/client/dfuse/pil4dfs/hook.c +++ b/src/client/dfuse/pil4dfs/hook.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "hook.h" #include "hook_int.h" @@ -89,10 +90,15 @@ static uint64_t lib_base_addr[MAX_NUM_LIB]; /* List of names of loaded libraries */ static char **lib_name_list; +/* libc version number in current process. e.g., 2.28 */ +static float libc_version; +static char *libc_version_str; + /* end to compile list of memory blocks in /proc/pid/maps */ static char *path_ld; static char *path_libc; +static char *path_libdl; static char *path_libpthread; /* This holds the path of libpil4dfs.so. It is needed when we want to * force child processes append libpil4dfs.so to env LD_PRELOAD. */ @@ -212,7 +218,7 @@ determine_lib_path(void) { int path_offset = 0, read_size, i, rc; char *read_buff_map = NULL; - char *pos, *start, *end, lib_ver_str[32] = "", *lib_dir_str = NULL; + char *pos, *start, *end, *lib_dir_str = NULL; read_size = read_map_file(&read_buff_map); @@ -289,19 +295,17 @@ determine_lib_path(void) goto err; path_libc[end - start] = 0; - pos = strstr(path_libc, "libc-2."); - if (pos) { - /* containing version in name. example, 2.17 */ - memcpy(lib_ver_str, pos + 5, 4); - lib_ver_str[4] = 0; + if (libc_version_str == NULL) { + libc_version_str = (char *)gnu_get_libc_version(); + if (libc_version_str == NULL) { + DS_ERROR(errno, "Failed to determine libc version"); + goto err; + } + libc_version = atof(libc_version_str); } - if (lib_ver_str[0]) { - /* with version in name */ - rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, lib_ver_str); - } else { - rc = asprintf(&path_libpthread, "%s/libpthread.so.0", lib_dir_str); - } + /* with version in name */ + rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, libc_version_str); if (rc < 0) { DS_ERROR(ENOMEM, "Failed to allocate memory for path_libpthread"); goto err_1; @@ -311,7 +315,18 @@ determine_lib_path(void) path_libpthread = NULL; DS_ERROR(ENAMETOOLONG, "path_libpthread is too long"); goto err_1; - } + } + rc = asprintf(&path_libdl, "%s/libdl-%s.so", lib_dir_str, libc_version_str); + if (rc < 0) { + DS_ERROR(ENOMEM, "Failed to allocate memory for path_libdl"); + goto err_1; + } + if (rc >= PATH_MAX) { + free(path_libdl); + path_libdl = NULL; + DS_ERROR(ENAMETOOLONG, "path_libdl is too long"); + goto err_1; + } D_FREE(lib_dir_str); pos = strstr(read_buff_map, "libpil4dfs.so"); @@ -348,6 +363,11 @@ query_pil4dfs_path(void) return path_libpil4dfs; } +float +query_libc_version(void) +{ + return libc_version; +} /* * query_func_addr - Determine the addresses and code sizes of functions in func_name_list[]. @@ -754,6 +774,7 @@ free_memory_in_hook(void) D_FREE(path_ld); D_FREE(path_libc); D_FREE(module_list); + free(path_libdl); free(path_libpthread); if (lib_name_list) { @@ -1034,6 +1055,8 @@ register_a_hook(const char *module_name, const char *func_name, const void *new_ module_name_local = path_ld; else if (strncmp(module_name, "libc", 5) == 0) module_name_local = path_libc; + else if (strncmp(module_name, "libdl", 6) == 0) + module_name_local = path_libdl; else if (strncmp(module_name, "libpthread", 11) == 0) module_name_local = path_libpthread; else diff --git a/src/client/dfuse/pil4dfs/hook.h b/src/client/dfuse/pil4dfs/hook.h index 7742faaff53..b686d99ce4e 100644 --- a/src/client/dfuse/pil4dfs/hook.h +++ b/src/client/dfuse/pil4dfs/hook.h @@ -60,4 +60,10 @@ free_memory_in_hook(void); char * query_pil4dfs_path(void); +/** + * return glibc version in current process + */ +float +query_libc_version(void); + #endif diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index b69449f1e3d..84567ac5f2d 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -161,6 +161,7 @@ static long int page_size; #define DAOS_INIT_RUNNING 1 static _Atomic uint64_t mpi_init_count; +static _Atomic int64_t zeInit_count; static long int daos_initing; _Atomic bool d_daos_inited; @@ -471,6 +472,9 @@ static int (*next_tcgetattr)(int fd, void *termios_p); static int (*next_mpi_init)(int *argc, char ***argv); static int (*next_pmpi_init)(int *argc, char ***argv); +static int (*next_ze_init)(int flags); +static void *(*next_dlsym)(void *handle, const char *symbol); +static void *(*new_dlsym)(void *handle, const char *symbol); /* to do!! */ /** @@ -1058,6 +1062,143 @@ PMPI_Init(int *argc, char ***argv) return rc; } +int +zeInit(int flags) +{ + int rc; + + if (next_ze_init == NULL) { + if (d_hook_enabled) + next_ze_init = next_dlsym(RTLD_NEXT, "zeInit"); + else + next_ze_init = dlsym(RTLD_NEXT, "zeInit"); + } + D_ASSERT(next_ze_init != NULL); + atomic_fetch_add_relaxed(&zeInit_count, 1); + rc = next_ze_init(flags); + atomic_fetch_add_relaxed(&zeInit_count, -1); + return rc; +} + +#if defined(__x86_64__) +/* This is used to work around compiling warning and limitations of using asm function. */ +static void * +query_new_dlsym_addr(void *addr) +{ + int i; + + /* assume little endian */ + for (i = 0; i < 64; i++) { + /* 0x56579090 is corresponding to the first four instructions at new_dlsym_asm. + * 0x90 - nop, 0x90 - nop, 0x57 - push %rdi, 0x56 - push %rsi + */ + if (*((int *)(addr + i)) == 0x56579090) { + /* two nop are added for easier positioning. offset +2 here to skip two + * nop and start from the real entry. + */ + return ((void *)(addr + i + 2)); + } + } + return NULL; +} + +_Pragma("GCC diagnostic push") +_Pragma("GCC diagnostic ignored \"-Wunused-function\"") +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") + +_Pragma("GCC push_options") +_Pragma("GCC optimize(\"-O0\")") +static char str_zeinit[] = "zeInit"; + +static int +is_hook_enabled(void) +{ + return (d_hook_enabled ? (1) : (0)); +} + +/* This wrapper function is introduced to avoid compiling issue with Intel-C on Leap 15.5 */ +static int +my_strcmp(const char *s1, const char *s2) +{ + return strcmp(s1, s2); +} + +static void * +get_zeinit_addr(void) +{ + return (void *)zeInit; +} + +__attribute__((aligned(16))) static void +new_dlsym_marker(void) +{ +} + +__asm__( + "new_dlsym_asm:\n" + "nop\n" + "nop\n" + "push %rdi\n" + "push %rsi\n" + + "call is_hook_enabled\n" + "test %eax,%eax\n" + "je org_dlsym\n" + + "mov %rsi, %rdi\n" + "lea str_zeinit(%rip), %rsi\n" + "call my_strcmp\n" + "test %eax,%eax\n" + "jne org_dlsym\n" + + "pop %rsi\n" + "pop %rdi\n" + "call *next_dlsym(%rip)\n" + "mov %rax, next_ze_init(%rip)\n" + + "test %eax,%eax\n" + "jne found\n" + "ret\n" + + "found:\n" + "call get_zeinit_addr\n" + "ret\n" + + "org_dlsym:\n" + "pop %rsi\n" + "pop %rdi\n" + "jmp *next_dlsym(%rip)\n" +); +_Pragma("GCC pop_options") +_Pragma("GCC diagnostic pop") + +#else +/* c code for other architecture. caller info could be wrong inside libc dlsym() when handle is set + * RTLD_NEXT. Assembly version implementation similar to above is needed to fix the issue by using + * jump instead of call instruction. + */ +static void * +new_dlsym_c(void *handle, const char *symbol) +{ + if (!d_hook_enabled) + goto org_dlsym; + printf("Inside my dlsym().\n"); + if (strcmp(symbol, "zeInit") != 0) + goto org_dlsym; + + next_ze_init = next_dlsym(handle, symbol); + if (next_ze_init) + /* dlsym() finished successfully, then intercept zeInit() */ + return zeInit; + else + return next_ze_init; + +org_dlsym: + /* Ideally we need to adjust stack and jump to next_dlsym(). */ + return next_dlsym(handle, symbol); +} +#endif + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. @@ -1164,6 +1305,15 @@ query_path(const char *szInput, int *is_target_path, struct dcache_rec **parent, goto out_normal; } + /* Check whether zeInit() is running. If yes, pass to the original + * libc functions. Avoid possible zeInit reentrancy/nested call. + */ + + if (atomic_load_relaxed(&zeInit_count) > 0) { + *is_target_path = 0; + goto out_normal; + } + /* daos_init() is expensive to call. We call it only when necessary. */ /* Check whether daos_init() is running. If yes, pass to the original @@ -2034,6 +2184,7 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char if (!is_target_path) goto org_func; + atomic_fetch_add_relaxed(&num_open, 1); if (oflags & O_CREAT && (oflags & O_DIRECTORY || oflags & O_PATH)) { /* Create a dir is not supported. */ errno = ENOENT; @@ -2061,7 +2212,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* Need to create a fake fd and associate with fd_kernel */ - atomic_fetch_add_relaxed(&num_open, 1); dfs_get_mode(dfs_obj, &mode_query); /* regular file */ @@ -2237,7 +2387,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char return (idx_dirfd + FD_DIR_BASE); } - atomic_fetch_add_relaxed(&num_open, 1); rc = find_next_available_fd(NULL, &idx_fd); if (rc) @@ -6014,7 +6163,7 @@ ioctl(int fd, unsigned long request, ...) va_list arg; void *param; struct dfuse_user_reply *reply; - int fd_directed; + int fd_directed = fd; va_start(arg, request); param = va_arg(arg, void *); @@ -6040,12 +6189,11 @@ ioctl(int fd, unsigned long request, ...) return next_ioctl(fd, request, param); fd_directed = d_get_fd_redirected(fd); - if (fd_directed < FD_FILE_BASE) + if ((fd_directed < FD_FILE_BASE) || (fd_directed >= (FD_DIR_BASE + MAX_OPENED_DIR))) return next_ioctl(fd, request, param); errno = ENOTSUP; - - return -1; + return (-1); } int @@ -6718,6 +6866,18 @@ check_exe_sh_bash(void) return; } +#define SMALL_DIFF (0.0001) +static int +libc_ver_cmp(float ver_a, float ver_b) +{ + if ((ver_a + SMALL_DIFF) < ver_b) + return (-1); + else if (ver_a > (ver_b + SMALL_DIFF)) + return (1); + else + return (0); +} + static __attribute__((constructor)) void init_myhook(void) { @@ -6725,6 +6885,7 @@ init_myhook(void) char *env_log; int rc; uint64_t eq_count_loc = 0; + float libc_version; umask_old = umask(0); umask(umask_old); @@ -6873,6 +7034,18 @@ init_myhook(void) register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit)); register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); +#if defined(__x86_64__) + new_dlsym = query_new_dlsym_addr(new_dlsym_marker); +#else + new_dlsym = new_dlsym_c; +#endif + D_ASSERT(new_dlsym != NULL); + libc_version = query_libc_version(); + if (libc_ver_cmp(libc_version, 2.34) < 0) + register_a_hook("libdl", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + else + register_a_hook("libc", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + init_fd_dup2_list(); /* Need to check whether current process is bash or not under regular & compatible modes.*/ @@ -6884,6 +7057,10 @@ init_myhook(void) dcache_rec_timeout = 0; install_hook(); + + /* Check it here to minimize the work in function new_dlsym() written in assembly */ + D_ASSERT(next_dlsym != NULL); + d_hook_enabled = 1; hook_enabled_bak = d_hook_enabled; } diff --git a/src/client/dfuse/pil4dfs/pil4dfs_int.h b/src/client/dfuse/pil4dfs/pil4dfs_int.h index a9c54b55555..0693123b51f 100644 --- a/src/client/dfuse/pil4dfs/pil4dfs_int.h +++ b/src/client/dfuse/pil4dfs/pil4dfs_int.h @@ -30,7 +30,7 @@ /* FD_FILE_BASE - The base number of the file descriptor for a directory. * The fd allocate from this lib is always larger than FD_FILE_BASE. */ -#define FD_DIR_BASE (0x40000000) +#define FD_DIR_BASE (FD_FILE_BASE + MAX_OPENED_FILE) /* structure allocated for a FD for a file */ struct file_obj {