diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 9975e290beb..f5b9c8b03e9 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -37,5 +37,9 @@ pip install --requirement requirements-utest.txt pip install /opt/daos/lib/daos/python/ +# set high open file limit in the shell to avoid extra warning +sudo prlimit --nofile=1024:262144 --pid $$ +prlimit -n + ./utils/node_local_test.py --max-log-size 1700MiB --dfuse-dir /localhome/jenkins/ \ --log-usage-save nltir.xml --log-usage-export nltr.json all diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 48d2090a5b3..c326d5a0b5f 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -518,6 +518,61 @@ check_grpid(crt_group_id_t grpid) return rc; } +#define CRT_MIN_TCP_FD 131072 + +/** For some providers, we require a file descriptor for every connection + * and some platforms set the soft limit too low, meaning we run out. We can + * set the limit up to the configured max by default to avoid this and warn + * when that isn't possible. + */ +static void +file_limit_bump(void) +{ + int rc; + struct rlimit rlim; + + /* Bump file descriptor limit if low and if possible */ + rc = getrlimit(RLIMIT_NOFILE, &rlim); + if (rc != 0) { + DS_ERROR(errno, "getrlimit() failed. 
Unable to check file descriptor limit"); + /** Per the man page, this can only fail if rlim is invalid */ + D_ASSERT(0); + return; + } + + if (rlim.rlim_cur >= CRT_MIN_TCP_FD) + return; + + if (rlim.rlim_max < CRT_MIN_TCP_FD) { + if (getuid() != 0) { + D_WARN("File descriptor hard limit should be at least %d, limit is %lu\n", + CRT_MIN_TCP_FD, rlim.rlim_max); + } else { + /** root should be able to change it */ + D_INFO("Super user attempting to update hard file descriptor limit to %d," + " limit was %lu\n", + CRT_MIN_TCP_FD, rlim.rlim_max); + rlim.rlim_max = CRT_MIN_TCP_FD; + } + + if (rlim.rlim_cur >= rlim.rlim_max) + return; + + /* May as well bump it as much as we can */ + } + + rlim.rlim_cur = rlim.rlim_max; + rc = setrlimit(RLIMIT_NOFILE, &rlim); + if (rc != 0) { + DS_ERROR(errno, + "setrlimit() failed. Unable to bump file descriptor" + " limit to value >= %d, limit is %lu", + CRT_MIN_TCP_FD, rlim.rlim_max); + return; + } + D_INFO("Updated soft file descriptor limit to %lu\n", rlim.rlim_max); +} + static void prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt) { @@ -545,6 +600,9 @@ prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt) d_setenv("FI_OFI_RXM_DEF_TCP_WAIT_OBJ", "pollfd", 0); } + if (prov == CRT_PROV_OFI_TCP || prov == CRT_PROV_OFI_TCP_RXM) + file_limit_bump(); + if (prov == CRT_PROV_OFI_CXI) mrc_enable = 1; diff --git a/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2 b/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2 index 82687901ed2..18f3ed0d847 100644 --- a/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2 +++ b/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2 @@ -214,6 +214,7 @@ fi info "Fixing daos_server_helper" run sudo -E "$DAOS_SOURCE_DIR/utils/setup_daos_server_helper.sh" run chmod -x "$DAOS_INSTALL_DIR/bin/daos_server_helper" +run sudo prlimit --nofile=1024:262144 --pid $$ info "Starting NLT tests" pushd "$DAOS_SOURCE_DIR" > /dev/null diff --git 
a/utils/node_local_test.py b/utils/node_local_test.py index 6ace1015df3..e5bd17ba938 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -25,6 +25,7 @@ import pwd import random import re +import resource import shutil import signal import stat @@ -6501,6 +6502,12 @@ def main(): parser.add_argument('mode', nargs='*') args = parser.parse_args() + # valgrind reduces the hard limit unless we bump the soft limit first + if args.memcheck != "no": + (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE) + if soft < hard: + resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard)) + if args.server_fi: server_fi(args) return