Skip to content

Commit

Permalink
DAOS-16645 cart: Bump file descriptor limit (#15224) (#15366)
Browse files Browse the repository at this point in the history
With tcp provider, using many sockets can cause significant
file descriptor usage.  Bump the soft limit, if possible
and warn if it appears insufficient.
Valgrind sets hard limit to soft limit, so work around that in NLT.

Signed-off-by: Jeff Olivier <[email protected]>
  • Loading branch information
jolivier23 authored Dec 19, 2024
1 parent 142a112 commit bf6a5b4
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 0 deletions.
4 changes: 4 additions & 0 deletions ci/unit/test_nlt_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,9 @@ pip install --requirement requirements-utest.txt

pip install /opt/daos/lib/daos/python/

# set high open file limit in the shell to avoid extra warning
sudo prlimit --nofile=1024:262144 --pid $$
prlimit -n

./utils/node_local_test.py --max-log-size 1700MiB --dfuse-dir /localhome/jenkins/ \
--log-usage-save nltir.xml --log-usage-export nltr.json all
58 changes: 58 additions & 0 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,61 @@ check_grpid(crt_group_id_t grpid)
return rc;
}

#define CRT_MIN_TCP_FD 131072

/** For some providers, we require a file descriptor for every connection
 * and some platforms set the soft limit too low, meaning we run out. Raise
 * the soft limit as far as the hard limit allows to avoid this, and warn
 * when the hard limit itself is below CRT_MIN_TCP_FD.
 *
 * Behavior:
 *  - soft limit already >= CRT_MIN_TCP_FD: nothing to do.
 *  - hard limit >= CRT_MIN_TCP_FD: soft limit is raised to the hard limit.
 *  - hard limit < CRT_MIN_TCP_FD, non-root: warn, then raise the soft limit
 *    to the hard limit if there is any headroom.
 *  - hard limit < CRT_MIN_TCP_FD, root: try to raise both limits to
 *    CRT_MIN_TCP_FD; if that is refused, fall back to raising the soft limit
 *    to the existing hard limit.
 *
 * Failures are logged; the function never fails the caller.
 */
static void
file_limit_bump(void)
{
	struct rlimit rlim;
	rlim_t        soft_orig;
	rlim_t        hard_orig;
	int           rc;

	/* Bump file descriptor limit if low and if possible */
	rc = getrlimit(RLIMIT_NOFILE, &rlim);
	if (rc != 0) {
		DS_ERROR(errno, "getrlimit() failed. Unable to check file descriptor limit");
		/** Per the man page, this can only fail if rlim is invalid */
		D_ASSERT(0);
		return;
	}

	if (rlim.rlim_cur >= CRT_MIN_TCP_FD)
		return;

	/* Remember what the kernel reported so messages and the fallback use real values */
	soft_orig = rlim.rlim_cur;
	hard_orig = rlim.rlim_max;

	if (hard_orig < CRT_MIN_TCP_FD) {
		if (getuid() != 0) {
			/* rlim_t is not guaranteed to be unsigned long, cast for %lu */
			D_WARN("File descriptor hard limit should be at least %d, limit is %lu\n",
			       CRT_MIN_TCP_FD, (unsigned long)hard_orig);
		} else {
			/** root should be able to change it */
			D_INFO("Super user attempting to update hard file descriptor limit to %d,"
			       " limit was %lu\n",
			       CRT_MIN_TCP_FD, (unsigned long)hard_orig);
			rlim.rlim_max = CRT_MIN_TCP_FD;
		}

		if (rlim.rlim_cur >= rlim.rlim_max)
			return;

		/* May as well bump it as much as we can */
	}

	rlim.rlim_cur = rlim.rlim_max;
	rc            = setrlimit(RLIMIT_NOFILE, &rlim);
	if (rc != 0 && rlim.rlim_max != hard_orig) {
		/*
		 * Raising the hard limit was refused (uid 0 does not always imply
		 * the privilege to do so). Settle for whatever headroom the
		 * existing hard limit offers rather than leaving the soft limit low.
		 */
		D_WARN("Unable to raise hard file descriptor limit to %d (errno %d),"
		       " hard limit remains %lu\n",
		       CRT_MIN_TCP_FD, errno, (unsigned long)hard_orig);
		if (soft_orig >= hard_orig)
			return;

		rlim.rlim_max = hard_orig;
		rlim.rlim_cur = hard_orig;
		rc            = setrlimit(RLIMIT_NOFILE, &rlim);
	}

	if (rc != 0) {
		/* Report the limit actually in force, not the value we tried to set */
		DS_ERROR(errno,
			 "setrlimit() failed. Unable to bump file descriptor"
			 " limit to value >= %d, limit is %lu",
			 CRT_MIN_TCP_FD, (unsigned long)hard_orig);
		return;
	}
	D_INFO("Updated soft file descriptor limit to %lu\n", (unsigned long)rlim.rlim_cur);
}

static void
prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt)
{
Expand Down Expand Up @@ -545,6 +600,9 @@ prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt)
d_setenv("FI_OFI_RXM_DEF_TCP_WAIT_OBJ", "pollfd", 0);
}

if (prov == CRT_PROV_OFI_TCP || prov == CRT_PROV_OFI_TCP_RXM)
file_limit_bump();

if (prov == CRT_PROV_OFI_CXI)
mrc_enable = 1;

Expand Down
1 change: 1 addition & 0 deletions utils/ansible/ftest/templates/daos-launch_nlt.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ fi
info "Fixing daos_server_helper"
run sudo -E "$DAOS_SOURCE_DIR/utils/setup_daos_server_helper.sh"
run chmod -x "$DAOS_INSTALL_DIR/bin/daos_server_helper"
run sudo prlimit --nofile=1024:262144 --pid $$

info "Starting NLT tests"
pushd "$DAOS_SOURCE_DIR" > /dev/null
Expand Down
7 changes: 7 additions & 0 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import pwd
import random
import re
import resource
import shutil
import signal
import stat
Expand Down Expand Up @@ -6501,6 +6502,12 @@ def main():
parser.add_argument('mode', nargs='*')
args = parser.parse_args()

# valgrind reduces the hard limit unless we bump the soft limit first
if args.memcheck != "no":
(soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE)
if soft < hard:
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

if args.server_fi:
server_fi(args)
return
Expand Down

0 comments on commit bf6a5b4

Please sign in to comment.