diff --git a/ci/unit/test_main_node.sh b/ci/unit/test_main_node.sh index 1ebbfb43e2a..bdda1ea0e3d 100755 --- a/ci/unit/test_main_node.sh +++ b/ci/unit/test_main_node.sh @@ -11,7 +11,6 @@ if grep /mnt/daos\ /proc/mounts; then fi sudo mkdir -p /mnt/daos -sudo mount -t tmpfs -o size=16G tmpfs /mnt/daos sudo mkdir -p "$DAOS_BASE" sudo mount -t nfs "$HOSTNAME":"$HOSTPWD" "$DAOS_BASE" sudo cp "$DAOS_BASE/install/bin/daos_admin" /usr/bin/daos_admin @@ -46,8 +45,10 @@ fi cd "$DAOS_BASE" if ${NLT:-false}; then mkdir -p vm_test + # NLT will mount /mnt/daos itself. ./utils/node_local_test.py --output-file=vm_test/nlt-errors.json all else + sudo mount -t tmpfs -o size=16G tmpfs /mnt/daos IS_CI=true OLD_CI=false RUN_TEST_VALGRIND="$WITH_VALGRIND" utils/run_test.sh if [ "$WITH_VALGRIND" == 'memcheck' ]; then diff --git a/src/rdb/rdb_raft.c b/src/rdb/rdb_raft.c index a8501b0b66d..d19e9092d07 100644 --- a/src/rdb/rdb_raft.c +++ b/src/rdb/rdb_raft.c @@ -2558,6 +2558,10 @@ rdb_raft_resign(struct rdb *db, uint64_t term) struct rdb_raft_state state; int rc; + if (db == NULL) { + D_ERROR("db cannot be NULL\n"); + return; + } ABT_mutex_lock(db->d_raft_mutex); if (term != raft_get_current_term(db->d_raft) || !raft_is_leader(db->d_raft)) { diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index 37e6fbc8804..14e56a54da3 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -23,4 +23,4 @@ servers: - FI_SOCKETS_CONN_TIMEOUT=2000 scm_mount: /mnt/daos scm_class: ram - scm_size: 4 + scm_size: 32 diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 77e654dc0af..82066327694 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -358,8 +358,7 @@ def start(self): server_env['PATH']) cmd = [daos_server, '--config={}'.format(self._yaml_file.name), - 'start', '-t' '4', '--insecure', '-d', self.agent_dir, - '--recreate-superblocks'] + 'start', '-t' '4', '--insecure', '-d', self.agent_dir] server_env['DAOS_DISABLE_REQ_FWD'] = '1' self._sp = 
subprocess.Popen(cmd, env=server_env) @@ -382,8 +381,33 @@ def start(self): self.conf.agent_dir = self.agent_dir self.running = True - # Use dmg to block until the server is ready to respond to requests. + # Configure the storage. DAOS wants to mount /mnt/daos itself if not + # already mounted, so let it do that. + # This code supports three modes of operation: + # /mnt/daos is not mounted. It will be mounted and formatted. + # /mnt/daos is mounted but empty. It will be remounted and formatted + # /mnt/daos exists and has data in. It will be used as is. start = time.time() + + cmd = ['storage', 'format'] + while True: + time.sleep(0.5) + rc = self.run_dmg(cmd) + ready = False + if rc.returncode == 1: + for line in rc.stdout.decode('utf-8').splitlines(): + if 'format storage of running instance' in line: + ready = True + if 'format request for already-formatted storage and reformat not specified' in line: + cmd = ['storage', 'format', '--reformat'] + if ready: + break + if time.time() - start > 20: + raise Exception("Failed to format") + + print('Format completion in {:.2f} seconds'.format(time.time() - start)) + + # Now wait until the system is up, basically for the format to happen. 
while True: time.sleep(0.5) rc = self.run_dmg(['system', 'query']) @@ -391,7 +415,7 @@ def start(self): if rc.returncode == 0: for line in rc.stdout.decode('utf-8').splitlines(): if line.startswith('status'): - if 'Ready' in line or 'Joined' in line: + if 'Joined' in line: ready = True if ready: @@ -409,6 +433,28 @@ def stop(self): if not self._sp: return + rc = self.run_dmg(['system', 'stop']) + assert rc.returncode == 0 + + start = time.time() + while True: + time.sleep(0.5) + rc = self.run_dmg(['system', 'query']) + ready = False + if rc.returncode == 0: + for line in rc.stdout.decode('utf-8').splitlines(): + if line.startswith('status'): + if 'Stopped' in line: + ready = True + if 'Stopping' in line: + ready = True + + if ready: + break + if time.time() - start > 20: + print('Failed to stop') + break + print('Server stopped in {:.2f} seconds'.format(time.time() - start)) # daos_server does not correctly shutdown daos_io_server yet # so find and kill daos_io_server directly. This may cause @@ -458,10 +504,6 @@ def stop(self): except ProcessLookupError: pass - # Workaround for DAOS-5648 - if ret == 2: - ret = 0 - # Show errors from server logs bug suppress memory leaks as the server # often segfaults at shutdown. if os.path.exists(self._log_file): @@ -477,7 +519,9 @@ def run_dmg(self, cmd): exe_cmd.append('--insecure') exe_cmd.extend(cmd) - return subprocess.run(exe_cmd, stdout=subprocess.PIPE) + return subprocess.run(exe_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) def il_cmd(dfuse, cmd, check_read=True, check_write=True): """Run a command under the interception library