From f2c8cd0c8f7c73341221ee15919cef616604f070 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Mon, 7 Dec 2020 08:42:49 +0000 Subject: [PATCH 1/4] Fix crash in shutdown and trace memleaks in server. Signed-off-by: Ashley Pittman --- src/rdb/rdb_raft.c | 4 ++++ utils/node_local_test.py | 22 +++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/rdb/rdb_raft.c b/src/rdb/rdb_raft.c index 5bdde066a5d..18aeb81ff70 100644 --- a/src/rdb/rdb_raft.c +++ b/src/rdb/rdb_raft.c @@ -2558,6 +2558,10 @@ rdb_raft_resign(struct rdb *db, uint64_t term) struct rdb_raft_state state; int rc; + if (db == NULL) { + D_ERROR("db cannot be NULL\n"); + return; + } ABT_mutex_lock(db->d_raft_mutex); if (term != raft_get_current_term(db->d_raft) || !raft_is_leader(db->d_raft)) { diff --git a/utils/node_local_test.py b/utils/node_local_test.py index c17921f1b81..701c87c9181 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -409,6 +409,26 @@ def stop(self): if not self._sp: return + rc = self.run_dmg(['system', 'stop']) + print(rc) + + start = time.time() + while True: + time.sleep(0.5) + rc = self.run_dmg(['system', 'query']) + print(rc) + ready = False + if rc.returncode == 0: + for line in rc.stdout.decode('utf-8').splitlines(): + if line.startswith('status'): + if 'Stopped' in line: + ready = True + if ready: + break + if time.time() - start > 20: + print('Failed to stop') + break + print('Server stopped in {:.2f} seconds'.format(time.time() - start)) # daos_server does not correctly shutdown daos_io_server yet # so find and kill daos_io_server directly. This may cause @@ -466,7 +486,7 @@ def stop(self): # often segfaults at shutdown. if os.path.exists(self._log_file): # TODO: Enable memleak checking when server shutdown works. 
- log_test(self.conf, self._log_file, show_memleaks=False) + log_test(self.conf, self._log_file, show_memleaks=True) self.running = False return ret From 148c634ebed0e774cad7703bf121ad27a0ad52a2 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Tue, 22 Dec 2020 16:00:11 +0000 Subject: [PATCH 2/4] Remove old workaround, and do not check for leaks. Signed-off-by: Ashley Pittman --- utils/node_local_test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 701c87c9181..0f0e103f86c 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -423,6 +423,9 @@ def stop(self): if line.startswith('status'): if 'Stopped' in line: ready = True + if 'Stopping' in line: + ready = True + if ready: break if time.time() - start > 20: @@ -478,15 +481,11 @@ def stop(self): except ProcessLookupError: pass - # Workaround for DAOS-5648 - if ret == 2: - ret = 0 - # Show errors from server logs bug suppress memory leaks as the server # often segfaults at shutdown. if os.path.exists(self._log_file): # TODO: Enable memleak checking when server shutdown works. - log_test(self.conf, self._log_file, show_memleaks=True) + log_test(self.conf, self._log_file, show_memleaks=False) self.running = False return ret From 00a6d70ba80acd3c2333f385e18440b53a73954d Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Wed, 23 Dec 2020 17:17:53 +0000 Subject: [PATCH 3/4] Use dmg to format, rather than --recreate-superblocks. 
Signed-off-by: Ashley Pittman --- ci/unit/test_main_node.sh | 3 ++- utils/nlt_server.yaml | 2 +- utils/node_local_test.py | 29 +++++++++++++++++++++++------ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ci/unit/test_main_node.sh b/ci/unit/test_main_node.sh index 1ebbfb43e2a..bdda1ea0e3d 100755 --- a/ci/unit/test_main_node.sh +++ b/ci/unit/test_main_node.sh @@ -11,7 +11,6 @@ if grep /mnt/daos\ /proc/mounts; then fi sudo mkdir -p /mnt/daos -sudo mount -t tmpfs -o size=16G tmpfs /mnt/daos sudo mkdir -p "$DAOS_BASE" sudo mount -t nfs "$HOSTNAME":"$HOSTPWD" "$DAOS_BASE" sudo cp "$DAOS_BASE/install/bin/daos_admin" /usr/bin/daos_admin @@ -46,8 +45,10 @@ fi cd "$DAOS_BASE" if ${NLT:-false}; then mkdir -p vm_test + # NLT will mount /mnt/daos itself. ./utils/node_local_test.py --output-file=vm_test/nlt-errors.json all else + sudo mount -t tmpfs -o size=16G tmpfs /mnt/daos IS_CI=true OLD_CI=false RUN_TEST_VALGRIND="$WITH_VALGRIND" utils/run_test.sh if [ "$WITH_VALGRIND" == 'memcheck' ]; then diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index 37e6fbc8804..14e56a54da3 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -23,4 +23,4 @@ servers: - FI_SOCKETS_CONN_TIMEOUT=2000 scm_mount: /mnt/daos scm_class: ram - scm_size: 4 + scm_size: 32 diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 0f0e103f86c..f9680d730cd 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -358,8 +358,7 @@ def start(self): server_env['PATH']) cmd = [daos_server, '--config={}'.format(self._yaml_file.name), - 'start', '-t' '4', '--insecure', '-d', self.agent_dir, - '--recreate-superblocks'] + 'start', '-t' '4', '--insecure', '-d', self.agent_dir] server_env['DAOS_DISABLE_REQ_FWD'] = '1' self._sp = subprocess.Popen(cmd, env=server_env) @@ -384,6 +383,23 @@ def start(self): # Use dmg to block until the server is ready to respond to requests. 
start = time.time() + + while True: + time.sleep(0.5) + rc = self.run_dmg(['storage', 'format']) + ready = False + if rc.returncode == 1: + for line in rc.stdout.decode('utf-8').splitlines(): + if 'format storage of running instance' in line: + ready = True + + if ready: + break + if time.time() - start > 20: + raise Exception("Failed to format") + + print('Format completion in {:.2f} seconds'.format(time.time() - start)) + while True: time.sleep(0.5) rc = self.run_dmg(['system', 'query']) @@ -391,7 +407,7 @@ def start(self): if rc.returncode == 0: for line in rc.stdout.decode('utf-8').splitlines(): if line.startswith('status'): - if 'Ready' in line or 'Joined' in line: + if 'Joined' in line: ready = True if ready: @@ -410,13 +426,12 @@ def stop(self): if not self._sp: return rc = self.run_dmg(['system', 'stop']) - print(rc) + assert rc.returncode == 0 start = time.time() while True: time.sleep(0.5) rc = self.run_dmg(['system', 'query']) - print(rc) ready = False if rc.returncode == 0: for line in rc.stdout.decode('utf-8').splitlines(): @@ -496,7 +511,9 @@ def run_dmg(self, cmd): exe_cmd.append('--insecure') exe_cmd.extend(cmd) - return subprocess.run(exe_cmd, stdout=subprocess.PIPE) + return subprocess.run(exe_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) def il_cmd(dfuse, cmd, check_read=True, check_write=True): """Run a command under the interception library From 8fd7878b5ce7a79465804813162b3ccc2774de80 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Wed, 23 Dec 2020 18:21:50 +0000 Subject: [PATCH 4/4] Handle the case where /mnt/daos is mounted, but empty. 
Signed-off-by: Ashley Pittman --- utils/node_local_test.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index f9680d730cd..d2114c14b08 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -381,18 +381,25 @@ def start(self): self.conf.agent_dir = self.agent_dir self.running = True - # Use dmg to block until the server is ready to respond to requests. + # Configure the storage. DAOS wants to mount /mnt/daos itself if not + # already mounted, so let it do that. + # This code supports three modes of operation: + # /mnt/daos is not mounted. It will be mounted and formatted. + # /mnt/daos is mounted but empty. It will be remounted and formatted. + # /mnt/daos exists and has data in. It will be used as is. start = time.time() + cmd = ['storage', 'format'] while True: time.sleep(0.5) - rc = self.run_dmg(['storage', 'format']) + rc = self.run_dmg(cmd) ready = False if rc.returncode == 1: for line in rc.stdout.decode('utf-8').splitlines(): if 'format storage of running instance' in line: ready = True - + if 'format request for already-formatted storage and reformat not specified' in line: + cmd = ['storage', 'format', '--reformat'] if ready: break if time.time() - start > 20: @@ -400,6 +407,7 @@ def start(self): print('Format completion in {:.2f} seconds'.format(time.time() - start)) + # Now wait until the system is up, basically for the format to happen. while True: time.sleep(0.5) rc = self.run_dmg(['system', 'query'])