Skip to content

Commit

Permalink
DAOS-3885 test: Fix crash in shutdown and try dmg shutdown in NLT (#4022
Browse files Browse the repository at this point in the history
)

* Fix crash in shutdown and trace memleaks in server.
* Remove old workaround, and do not check for leaks.
* Use dmg to format, rather than --recreate-superblocks.
* Handle the case where /mnt/daos is mounted, but empty.

Signed-off-by: Ashley Pittman <[email protected]>
  • Loading branch information
ashleypittman authored Jan 5, 2021
1 parent acbc380 commit b066801
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 11 deletions.
3 changes: 2 additions & 1 deletion ci/unit/test_main_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ if grep /mnt/daos\ /proc/mounts; then
fi
sudo mkdir -p /mnt/daos

sudo mount -t tmpfs -o size=16G tmpfs /mnt/daos
sudo mkdir -p "$DAOS_BASE"
sudo mount -t nfs "$HOSTNAME":"$HOSTPWD" "$DAOS_BASE"
sudo cp "$DAOS_BASE/install/bin/daos_admin" /usr/bin/daos_admin
Expand Down Expand Up @@ -46,8 +45,10 @@ fi
cd "$DAOS_BASE"
if ${NLT:-false}; then
mkdir -p vm_test
# NLT will mount /mnt/daos itself.
./utils/node_local_test.py --output-file=vm_test/nlt-errors.json all
else
sudo mount -t tmpfs -o size=16G tmpfs /mnt/daos
IS_CI=true OLD_CI=false RUN_TEST_VALGRIND="$WITH_VALGRIND" utils/run_test.sh

if [ "$WITH_VALGRIND" == 'memcheck' ]; then
Expand Down
4 changes: 4 additions & 0 deletions src/rdb/rdb_raft.c
Original file line number Diff line number Diff line change
Expand Up @@ -2558,6 +2558,10 @@ rdb_raft_resign(struct rdb *db, uint64_t term)
struct rdb_raft_state state;
int rc;

if (db == NULL) {
D_ERROR("db cannot be NULL\n");
return;
}
ABT_mutex_lock(db->d_raft_mutex);
if (term != raft_get_current_term(db->d_raft) ||
!raft_is_leader(db->d_raft)) {
Expand Down
2 changes: 1 addition & 1 deletion utils/nlt_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ servers:
- FI_SOCKETS_CONN_TIMEOUT=2000
scm_mount: /mnt/daos
scm_class: ram
scm_size: 4
scm_size: 32
62 changes: 53 additions & 9 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,8 +358,7 @@ def start(self):
server_env['PATH'])

cmd = [daos_server, '--config={}'.format(self._yaml_file.name),
'start', '-t' '4', '--insecure', '-d', self.agent_dir,
'--recreate-superblocks']
'start', '-t' '4', '--insecure', '-d', self.agent_dir]

server_env['DAOS_DISABLE_REQ_FWD'] = '1'
self._sp = subprocess.Popen(cmd, env=server_env)
Expand All @@ -382,16 +381,41 @@ def start(self):
self.conf.agent_dir = self.agent_dir
self.running = True

# Use dmg to block until the server is ready to respond to requests.
# Configure the storage. DAOS wants to mount /mnt/daos itself if not
# already mounted, so let it do that.
# This code supports three modes of operation:
# /mnt/daos is not mounted. It will be mounted and formatted.
# /mnt/daos is mounted but empty. It will be remounted and formatted
# /mnt/daos exists and has data in. It will be used as is.
start = time.time()

cmd = ['storage', 'format']
while True:
time.sleep(0.5)
rc = self.run_dmg(cmd)
ready = False
if rc.returncode == 1:
for line in rc.stdout.decode('utf-8').splitlines():
if 'format storage of running instance' in line:
ready = True
if 'format request for already-formatted storage and reformat not specified' in line:
cmd = ['storage', 'format', '--reformat']
if ready:
break
if time.time() - start > 20:
raise Exception("Failed to format")

print('Format completion in {:.2f} seconds'.format(time.time() - start))

# Now wait until the system is up, basically for the format to happen.
while True:
time.sleep(0.5)
rc = self.run_dmg(['system', 'query'])
ready = False
if rc.returncode == 0:
for line in rc.stdout.decode('utf-8').splitlines():
if line.startswith('status'):
if 'Ready' in line or 'Joined' in line:
if 'Joined' in line:
ready = True

if ready:
Expand All @@ -409,6 +433,28 @@ def stop(self):

if not self._sp:
return
rc = self.run_dmg(['system', 'stop'])
assert rc.returncode == 0

start = time.time()
while True:
time.sleep(0.5)
rc = self.run_dmg(['system', 'query'])
ready = False
if rc.returncode == 0:
for line in rc.stdout.decode('utf-8').splitlines():
if line.startswith('status'):
if 'Stopped' in line:
ready = True
if 'Stopping' in line:
ready = True

if ready:
break
if time.time() - start > 20:
print('Failed to stop')
break
print('Server stopped in {:.2f} seconds'.format(time.time() - start))

# daos_server does not correctly shutdown daos_io_server yet
# so find and kill daos_io_server directly. This may cause
Expand Down Expand Up @@ -458,10 +504,6 @@ def stop(self):
except ProcessLookupError:
pass

# Workaround for DAOS-5648
if ret == 2:
ret = 0

# Show errors from server logs but suppress memory leaks as the server
# often segfaults at shutdown.
if os.path.exists(self._log_file):
Expand All @@ -477,7 +519,9 @@ def run_dmg(self, cmd):
exe_cmd.append('--insecure')
exe_cmd.extend(cmd)

return subprocess.run(exe_cmd, stdout=subprocess.PIPE)
return subprocess.run(exe_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)

def il_cmd(dfuse, cmd, check_read=True, check_write=True):
"""Run a command under the interception library
Expand Down

0 comments on commit b066801

Please sign in to comment.