Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-6287 test: Verify rebuild continues after one of the ranks is st… #14100

Merged
merged 11 commits into from
Apr 22, 2024
122 changes: 122 additions & 0 deletions src/tests/ftest/rebuild/continues_after_stop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
(C) Copyright 2024 Intel Corporation.

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import os
import threading
import time

from command_utils_base import CommandFailure
from general_utils import get_journalctl, journalctl_time, wait_for_result
from ior_test_base import IorTestBase
from ior_utils import IorCommand
from job_manager_utils import get_job_manager


class ContinuesAfterStop(IorTestBase):
    """Verify rebuild continues after one of the ranks is stopped.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a ContinuesAfterStop object."""
        super().__init__(*args, **kwargs)
        # Number of journalctl searches performed so far; only used in log output.
        self.search_count = 0

    def run_ior_basic(self, namespace, pool, container):
        """Run IOR once with configurations in the test yaml.

        A CommandFailure raised while running IOR is logged and not re-raised, so the
        caller (a worker thread in this test) always returns normally.

        Args:
            namespace (str): Namespace that defines block_size and transfer_size.
            pool (TestPool): Pool to use with IOR.
            container (TestContainer): Container to use with IOR.
        """
        ior_cmd = IorCommand(namespace=namespace)
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, pool, container.identifier)
        testfile = os.path.join(os.sep, "test_file_1")
        ior_cmd.test_file.update(testfile)

        manager = get_job_manager(test=self, job=ior_cmd, subprocess=self.subprocess)
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        ppn = self.params.get("ppn", namespace)
        manager.assign_processes(ppn=ppn)

        try:
            manager.run()
        except CommandFailure as error:
            # Deliberate best-effort: presumably IOR is expected to fail because the test
            # stops ranks while it is still writing - TODO confirm.
            self.log.info(error)

    def test_continuous_after_stop(self):
        """Verify rebuild continues after one of the ranks is stopped.

        1. Create a pool and a container.
        2. Run IOR that takes several seconds with a thread.
        3. After a few seconds, stop one of the ranks (rank 3).
        4. Look for the start of the rebuild (Rebuild [scanning]) in journalctl with daos_server
        identifier.
        5. As soon as the message is detected, stop the rest of the ranks (0, 1, 2).
        6. Restart the three ranks.
        7. Wait for rebuild to finish.

        Jira ID: DAOS-6287

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=rebuild
        :avocado: tags=ContinuesAfterStop,test_continuous_after_stop
        """
        self.log_step("Create a pool and a container.")
        pool = self.get_pool()
        container = self.get_container(pool=pool)

        self.log_step("Run IOR that takes several seconds with a thread.")
        kwargs = {
            "namespace": "/run/ior/*",
            "pool": pool,
            "container": container,
        }
        thread = threading.Thread(target=self.run_ior_basic, kwargs=kwargs)
        # Captured before IOR starts so the journalctl search below only covers this test.
        ior_start_time = journalctl_time()
        thread.start()

        self.log_step("After a few seconds, stop one of the ranks (rank 3).")
        # Wait for IOR to start and write some data. Otherwise rebuild will not occur.
        time.sleep(5)
        self.server_managers[0].stop_ranks(ranks=[3], daos_log=self.log)

        self.log_step(
            "Look for the start of the rebuild (Rebuild [scanning]) in journalctl with "
            "daos_server identifier.")

        def _search_scanning():
            """Search 'Rebuild [scanning]' from journalctl output using wait_for_result().

            Returns:
                bool: True if a journalctl line containing 'Rebuild [scanning]' was found;
                    False otherwise.
            """
            self.log.info("Search 'Rebuild [scanning]'. Count = %d", self.search_count)
            self.search_count += 1
            journalctl_out = get_journalctl(
                hosts=self.hostlist_servers, since=ior_start_time, until=None,
                journalctl_type="daos_server")

            for journalctl in journalctl_out:
                for line in journalctl["data"].splitlines():
                    if "Rebuild [scanning]" in line:
                        self.log.info("'Rebuild [scanning]' found: %s", line)
                        return True
            return False

        scanning_found = wait_for_result(self.log, _search_scanning, timeout=120, delay=1)
        # Join before a possible self.fail() so the IOR thread is never left running.
        # NOTE(review): this also delays stopping ranks 0-2 until IOR has returned - confirm
        # that is intended given step 5 says "as soon as the message is detected".
        thread.join()
        if not scanning_found:
            self.fail("'Rebuild [scanning]' wasn't found in journalctl after stopping a rank!")

        self.log_step("As soon as the message is detected, stop the rest of the ranks (0, 1, 2).")
        self.server_managers[0].stop_ranks(ranks=[0, 1, 2], daos_log=self.log)

        self.log_step("Restart the three ranks.")
        self.server_managers[0].start_ranks(ranks=[0, 1, 2], daos_log=self.log)

        self.log_step("Wait for rebuild to finish.")
        pool.wait_for_rebuild_to_end(interval=5)
39 changes: 39 additions & 0 deletions src/tests/ftest/rebuild/continues_after_stop.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Test configuration for rebuild/continues_after_stop.py.
hosts:
  test_servers: 2
  test_clients: 1

timeout: 300

# Two server hosts with two engines each provide the four ranks (0-3) that the test
# stops and restarts.
server_config:
  name: daos_server
  engines_per_host: 2
  engines:
    0:
      targets: 4
      nr_xs_helpers: 1
      fabric_iface: ib0
      fabric_iface_port: 31317
      log_file: daos_server_0.log
      storage: auto
    1:
      targets: 4
      nr_xs_helpers: 1
      fabric_iface: ib1
      fabric_iface_port: 31417
      log_file: daos_server_1.log
      storage: auto

pool:
  size: 10%

container:
  type: POSIX
  control_method: daos

# Read by the test through the "/run/ior/*" namespace (including ppn).
ior:
  flags: -w
  api: DFS
  ppn: 1
  oclass: SX
  block_size: 2G
  transfer_size: 256K
Loading