Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-6287 test: Verify rebuild continues after one of the ranks is stopped #14100

Merged
merged 11 commits into from
Apr 22, 2024
125 changes: 125 additions & 0 deletions src/tests/ftest/rebuild/continues_after_stop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
(C) Copyright 2024 Intel Corporation.

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import os
import threading
import time

from command_utils_base import CommandFailure
from general_utils import get_journalctl, journalctl_time
from ior_test_base import IorTestBase
from ior_utils import IorCommand
from job_manager_utils import get_job_manager


class ContinuesAfterStop(IorTestBase):
    """Verify rebuild continues after one of the ranks is stopped.

    :avocado: recursive
    """

    def run_ior_basic(self, namespace, pool, container):
        """Run IOR once with configurations in the test yaml.

        The test case runs this method in a separate thread. A CommandFailure
        raised while IOR runs is logged and not re-raised, so a failing IOR does
        not by itself fail the test.

        Args:
            namespace (str): Namespace that defines block_size and transfer_size.
            pool (TestPool): Pool to use with IOR.
            container (TestContainer): Container to use with IOR.
        """
        ior_cmd = IorCommand(namespace=namespace)
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, pool, container.identifier)
        testfile = os.path.join(os.sep, "test_file_1")
        ior_cmd.test_file.update(testfile)
        manager = get_job_manager(test=self, job=ior_cmd, subprocess=self.subprocess)
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        ppn = self.params.get("ppn", namespace)
        manager.assign_processes(ppn=ppn)

        try:
            manager.run()
        except CommandFailure as error:
            # Deliberately best-effort: only record the failure in the test log.
            self.log.info(error)

    def _wait_for_journalctl_message(self, message, since, attempts=120, interval=1):
        """Poll the daos_server journalctl output of the server hosts for a message.

        Each attempt fetches the journal from ``since`` onwards and searches every
        line of every returned entry for ``message``.

        Args:
            message (str): Substring to look for in the journalctl lines.
            since (str): Start of the journalctl time window.
            attempts (int, optional): Maximum number of polls. Defaults to 120.
            interval (int, optional): Seconds to sleep after an unsuccessful poll.
                Defaults to 1.

        Returns:
            bool: True as soon as a line containing ``message`` is found; False if
                all attempts are used up without a match.
        """
        for count in range(attempts):
            self.log.info("Look for '%s'. Count = %d", message, count)
            journalctl_out = get_journalctl(
                hosts=self.hostlist_servers, since=since, until=None,
                journalctl_type="daos_server")
            for journalctl in journalctl_out:
                matching_line = next(
                    (line for line in journalctl["data"].splitlines() if message in line),
                    None)
                if matching_line is not None:
                    self.log.info("'%s' found: %s", message, matching_line)
                    return True
            time.sleep(interval)
        return False

    def test_continuous_after_stop(self):
        """Verify rebuild continues after one of the ranks is stopped.

        1. Create a pool and a container.
        2. Run IOR that takes several seconds with a thread.
        3. After a few seconds, stop one of the ranks (rank 3).
        4. Look for the start of the rebuild (Rebuild [scanning]) in journalctl with daos_server
        identifier.
        5. As soon as the message is detected, stop the rest of the ranks (0, 1, 2).
        6. Restart the three ranks.
        7. Wait for rebuild to finish.

        Jira ID: DAOS-6287

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=rebuild
        :avocado: tags=ContinuesAfterStop,test_continuous_after_stop
        """
        # 1. Create a pool and a container.
        self.log_step("Create a pool and a container.")
        pool = self.get_pool()
        container = self.get_container(pool=pool)

        # 2. Run IOR that takes several seconds with a thread.
        self.log_step("Run IOR that takes several seconds with a thread.")
        kwargs = {
            "namespace": "/run/ior/*",
            "pool": pool,
            "container": container
        }
        thread = threading.Thread(target=self.run_ior_basic, kwargs=kwargs)
        # Taken before IOR starts so the journalctl search window covers the whole run.
        ior_start_time = journalctl_time()
        thread.start()

        # 3. After a few seconds, stop one of the ranks (rank 3).
        self.log_step("After a few seconds, stop one of the ranks (rank 3).")
        # Wait for IOR to start and write some data. Otherwise rebuild will not occur.
        time.sleep(5)
        self.server_managers[0].stop_ranks(ranks=[3], daos_log=self.log)

        # 4. Look for the start of the rebuild (Rebuild [scanning]) in journalctl with daos_server
        # identifier.
        # The message carries no step number, matching every other log_step() call here.
        self.log_step(
            "Look for the start of the rebuild (Rebuild [scanning]) in journalctl with "
            "daos_server identifier.")
        scanning_found = self._wait_for_journalctl_message(
            message="Rebuild [scanning]", since=ior_start_time)

        # Join the IOR thread before judging the result so it is not left running
        # if the test fails below.
        # NOTE(review): join() has no timeout, so step 5 starts only once IOR has
        # returned - confirm this matches the "as soon as" intent.
        thread.join()
        if not scanning_found:
            self.fail("'Rebuild [scanning]' wasn't found in journalctl after stopping a rank!")

        # 5. As soon as the message is detected, stop the rest of the ranks (0, 1, 2).
        self.log_step("As soon as the message is detected, stop the rest of the ranks (0, 1, 2).")
        self.server_managers[0].stop_ranks(ranks=[0, 1, 2], daos_log=self.log)

        # 6. Restart the three ranks.
        self.log_step("Restart the three ranks.")
        self.server_managers[0].start_ranks(ranks=[0, 1, 2], daos_log=self.log)

        # 7. Wait for rebuild to finish.
        self.log_step("Wait for rebuild to finish.")
        pool.wait_for_rebuild_to_end(interval=5)
39 changes: 39 additions & 0 deletions src/tests/ftest/rebuild/continues_after_stop.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Test configuration for rebuild/continues_after_stop.py.

# Number of server and client hosts requested for the test.
hosts:
  test_servers: 2
  test_clients: 1

# Overall test timeout (presumably in seconds - confirm against the framework).
timeout: 300

# Two engines per server host. With two server hosts this presumably yields the
# four ranks (0-3) that the test stops and restarts - TODO confirm rank mapping.
server_config:
  name: daos_server
  engines_per_host: 2
  engines:
    0:
      targets: 4
      nr_xs_helpers: 1
      fabric_iface: ib0
      fabric_iface_port: 31317
      log_file: daos_server_0.log
      storage: auto
    1:
      targets: 4
      nr_xs_helpers: 1
      fabric_iface: ib1
      fabric_iface_port: 31417
      log_file: daos_server_1.log
      storage: auto

pool:
  size: 10%

container:
  type: POSIX
  control_method: daos

# Read by the test through the "/run/ior/*" namespace; ppn is fetched from here
# and passed to the job manager.
ior:
  # -w: presumably IOR's write-only option - confirm against the IOR docs.
  flags: -w
  api: DFS
  ppn: 1
  oclass: SX
  # Presumably sized so IOR is still writing when rank 3 is stopped (the test
  # waits 5 seconds first) - confirm on the target hardware.
  block_size: 2G
  transfer_size: 256K
Loading