Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-12287 test: CR Pass 4 - Orphan container #13063

Merged
merged 10 commits into from
Oct 20, 2023
178 changes: 178 additions & 0 deletions src/tests/ftest/recovery/container_list_consolidation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
"""
(C) Copyright 2023 Intel Corporation.

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import time
import re
from ClusterShell.NodeSet import NodeSet

from recovery_test_base import RecoveryTestBase
from general_utils import report_errors
from ddb_utils import DdbCommand
from exception_utils import CommandFailure


class ContainerListConsolidationTest(RecoveryTestBase):
    """Test Pass 4: Container List Consolidation

    :avocado: recursive
    """

    # Canonical 8-4-4-4-12 UUID. The character class is restricted to hexadecimal digits; a
    # range such as [0-f] would also match punctuation and every upper case letter.
    UUID_REGEX = re.compile(
        r"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})", re.IGNORECASE)

    def wait_for_check_complete(self):
        """Repeatedly call dmg check query until status becomes COMPLETED.

        The checker is queried up to 8 times with a 5 second pause after each query that
        does not report COMPLETED. If the status doesn't become COMPLETED, or it becomes
        COMPLETED without any report, fail the test.

        Returns:
            list: List of repair reports.

        """
        repair_reports = None
        for _ in range(8):
            response = self.get_dmg_command().check_query()["response"]
            if response["status"] == "COMPLETED":
                repair_reports = response["reports"]
                break
            time.sleep(5)

        if not repair_reports:
            self.fail("Checker didn't detect or repair any inconsistency!")

        return repair_reports

    def test_orphan_container(self):
        """Test orphan container. Container is in shard, but not in PS.

        1. Create a pool and a container.
        2. Inject fault to cause orphan container. i.e., container is left in the system,
        but doesn't appear with daos commands.
        3. Check that the container doesn't appear with daos command.
        4. Stop servers.
        5. Use ddb to verify that the container is left in shards.
        6. Enable the checker.
        7. Set policy to --all-interactive.
        8. Start the checker and query the checker until the fault is detected.
        9. Repair by selecting the destroy option.
        10. Query the checker until the fault is repaired.
        11. Disable the checker.
        12. Run the ddb command and verify that the container is removed from shard.

        Jira ID: DAOS-12287

        :avocado: tags=all,pr
        :avocado: tags=vm
        :avocado: tags=recovery,container_list_consolidation
        :avocado: tags=ContainerListConsolidationTest,test_orphan_container
        """
        # Verification failures from every step are accumulated here and reported together
        # at the end, so the list must never be re-initialized part way through the test.
        errors = []

        # 1. Create a pool and a container.
        self.log_step("Create a pool and a container")
        pool = self.get_pool(connect=False)
        container = self.get_container(pool=pool)

        # 2. Inject fault to cause orphan container.
        self.log_step("Inject fault to cause orphan container.")
        daos_command = self.get_daos_command()
        daos_command.faults_container(
            pool=pool.identifier, cont=container.identifier,
            location="DAOS_CHK_CONT_ORPHAN")

        # 3. Check that the container doesn't appear with daos command.
        self.log_step("Check that the container doesn't appear with daos command.")
        pool_list = daos_command.pool_list_containers(pool=pool.identifier)
        if pool_list["response"]:
            errors.append(f"Container appears with daos command! {pool_list}")

        # 4. Stop servers.
        self.log_step("Stop servers.")
        dmg_command = self.get_dmg_command()
        dmg_command.system_stop()

        # 5. Use ddb to verify that the container is left in shards.
        self.log_step("Use ddb to verify that the container is left in shards.")
        scm_mount = self.server_managers[0].get_config_value("scm_mount")
        ddb_command = DdbCommand(
            server_host=NodeSet(self.hostlist_servers[0]), path=self.bin,
            mount_point=scm_mount, pool_uuid=pool.uuid,
            vos_file=self.get_vos_file_path(pool=pool))
        cmd_result = ddb_command.list_component()
        ls_out = "\n".join(cmd_result[0]["stdout"])
        match = self.UUID_REGEX.search(ls_out)
        if match is None:
            self.fail("Unexpected output from ddb command, unable to parse.")
        self.log.info("Container UUID from ddb ls = %s", match.group(1))

        # UUID is found. Verify that it's the UUID of the container we created. UUIDs are
        # case-insensitive, so both sides are normalized to lower case before comparing.
        actual_uuid = match.group(1).lower()
        expected_uuid = container.uuid.lower()
        if actual_uuid != expected_uuid:
            errors.append(
                f"Unexpected container UUID! Expected = {expected_uuid}; Actual = {actual_uuid}")

        # 6. Enable the checker.
        self.log_step("Enable the checker.")
        dmg_command.check_enable(stop=False)

        # 7. Set policy to --all-interactive.
        self.log_step("Set policy to --all-interactive.")
        dmg_command.check_set_policy(all_interactive=True)

        # 8. Start the checker and query the checker until the fault is detected.
        self.log_step("Start and query the checker until the fault is detected.")
        seq_num = None
        # Start checker.
        dmg_command.check_start()
        # Query the checker until it is running and has reported an inconsistency.
        for _ in range(8):
            response = dmg_command.check_query()["response"]
            # Status is INIT before starting the checker.
            if response["status"] == "RUNNING" and response["reports"]:
                seq_num = response["reports"][0]["seq"]
                break
            time.sleep(5)
        # Compare against None explicitly so that a sequence number of 0 is not mistaken
        # for "nothing detected".
        if seq_num is None:
            self.fail("Checker didn't detect any fault!")

        # 9. Repair by selecting the destroy option, 0.
        msg = ("Repair with option 0; Destroy the orphan container to release space "
               "[suggested].")
        self.log_step(msg)
        dmg_command.check_repair(seq_num=seq_num, action=0)

        # 10. Query the checker until the fault is repaired.
        self.log_step("Query the checker until the fault is repaired.")
        repair_report = self.wait_for_check_complete()[0]

        # Verify that the repair report has expected message "Discard the container".
        action_message = repair_report["act_msgs"][0]
        exp_msg = "Discard the container"
        if exp_msg not in action_message:
            errors.append(f"{exp_msg} not in {action_message}!")

        # 11. Disable the checker.
        self.log_step("Disable the checker.")
        dmg_command.check_disable(start=False)

        # 12. Run the ddb command and verify that the container is removed from shard.
        self.log_step(
            "Run the ddb command and verify that the container is removed from shard.")
        cmd_result = ddb_command.list_component()
        ls_out = "\n".join(cmd_result[0]["stdout"])
        if self.UUID_REGEX.search(ls_out):
            errors.append("Container UUID is found in shard! Checker didn't remove it.")

        # Start server to prepare for the cleanup.
        try:
            dmg_command.system_start()
        except CommandFailure as error:
            # Handle the potential system start error just in case.
            self.log.error(error)
        finally:
            # Report from finally so collected errors are surfaced even if the restart
            # raises something other than CommandFailure.
            report_errors(test=self, errors=errors)
24 changes: 24 additions & 0 deletions src/tests/ftest/recovery/container_list_consolidation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Test parameters that accompany recovery/container_list_consolidation.py.
# NOTE(review): key nesting (indentation) is not visible in this view; confirm the
# hierarchy against the checked-in file before relying on these notes.
hosts:
test_servers: 1

# Presumably the overall test timeout in seconds - TODO confirm.
timeout: 360

server_config:
name: daos_server
engines_per_host: 1
engines:
0:
targets: 4
nr_xs_helpers: 0
storage:
0:
class: ram
# The test reads this value with get_config_value("scm_mount") to build the ddb command.
scm_mount: /mnt/daos
system_ram_reserved: 1

pool:
size: 5G

container:
type: POSIX
# Presumably selects the daos command line tool for container operations - TODO confirm.
control_method: daos
43 changes: 8 additions & 35 deletions src/tests/ftest/recovery/ddb.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
import re
from ClusterShell.NodeSet import NodeSet

from apricot import TestWithServers
from general_utils import report_errors, run_pcmd, insert_objects, \
distribute_files, DaosTestError, get_random_string, copy_remote_to_local
from recovery_test_base import RecoveryTestBase
from general_utils import report_errors, insert_objects, distribute_files, \
DaosTestError, get_random_string, copy_remote_to_local
from ddb_utils import DdbCommand
from exception_utils import CommandFailure


class DdbTest(TestWithServers):
class DdbTest(RecoveryTestBase):
"""Test ddb subcommands.

:avocado: recursive
Expand All @@ -32,33 +32,6 @@ def __init__(self, *args, **kwargs):
self.random_akey = get_random_string(10)
self.random_data = get_random_string(10)

def get_vos_file_path(self):
    """Find a VOS file of self.pool on the first server host.

    Lists <scm_mount>/<pool_uuid> with "sudo ls" on the first server host and picks the
    first listed entry whose name contains "vos". The test is failed if no such entry
    exists.

    Returns:
        str: the matching entry exactly as printed by "ls", e.g. vos-0

    """
    first_server = NodeSet(self.hostlist_servers[0])
    scm_mount = self.server_managers[0].get_config_value("scm_mount")
    pool_dir = os.path.join(scm_mount, self.pool.uuid.lower())
    ls_result = run_pcmd(hosts=first_server, command=" ".join(["sudo", "ls", pool_dir]))

    # Assume the VOS file has "vos" in the file name; take the first such entry.
    vos_file = next((entry for entry in ls_result[0]["stdout"] if "vos" in entry), None)
    if vos_file is None:
        self.fail("vos file wasn't found in {}/{}".format(
            scm_mount, self.pool.uuid.lower()))
        return None  # to appease pylint

    self.log.info("vos_file: %s", vos_file)
    return vos_file

def test_recovery_ddb_ls(self):
"""Test ddb ls.

Expand All @@ -85,7 +58,7 @@ def test_recovery_ddb_ls(self):
ddb_command = DdbCommand(
server_host=NodeSet(self.hostlist_servers[0]), path=self.bin,
mount_point=scm_mount, pool_uuid=self.pool.uuid,
vos_file=self.get_vos_file_path())
vos_file=self.get_vos_file_path(pool=self.pool))

errors = []

Expand Down Expand Up @@ -277,7 +250,7 @@ def test_recovery_ddb_rm(self):
dmg_command.system_stop()

# 3. Find the vos file name.
vos_file = self.get_vos_file_path()
vos_file = self.get_vos_file_path(pool=self.pool)
host = NodeSet(self.hostlist_servers[0])
scm_mount = self.server_managers[0].get_config_value("scm_mount")
ddb_command = DdbCommand(
Expand Down Expand Up @@ -420,7 +393,7 @@ def test_recovery_ddb_load(self):
dmg_command.system_stop()

# 4. Find the vos file name.
vos_file = self.get_vos_file_path()
vos_file = self.get_vos_file_path(pool=self.pool)
host = NodeSet(self.hostlist_servers[0])
scm_mount = self.server_managers[0].get_config_value("scm_mount")
ddb_command = DdbCommand(
Expand Down Expand Up @@ -509,7 +482,7 @@ def test_recovery_ddb_dump_value(self):
dmg_command.system_stop()

# 4. Find the vos file name.
vos_file = self.get_vos_file_path()
vos_file = self.get_vos_file_path(pool=self.pool)
host = NodeSet(self.hostlist_servers[0])
scm_mount = self.server_managers[0].get_config_value("scm_mount")
ddb_command = DdbCommand(
Expand Down
41 changes: 41 additions & 0 deletions src/tests/ftest/util/daos_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,23 @@ def pool_list_attrs(self, pool, sys_name=None, verbose=False):
("pool", "list-attrs"), pool=pool, sys_name=sys_name,
verbose=verbose)

def pool_list_containers(self, pool, sys_name=None):
    """Run the daos pool list-containers command and return its JSON output.

    Args:
        pool (str): label or UUID of the pool whose containers are listed
        sys_name (str, optional): DAOS system name. Defaults to None.

    Returns:
        dict: JSON output

    Raises:
        CommandFailure: if the daos pool list-containers command fails.

    """
    sub_command = ("pool", "list-containers")
    command_args = {"pool": pool, "sys_name": sys_name}
    return self._get_json_result(sub_command, **command_args)

def container_query(self, pool, cont, sys_name=None):
"""Query a container.

Expand Down Expand Up @@ -824,6 +841,30 @@ def object_query(self, pool, cont, oid, sys_name=None):

return data

def faults_container(self, pool, cont, location, sys_name=None, path=None, rank=None,
                     frequency=None):
    """Run the daos faults container command to inject a fault into a container.

    Args:
        pool (str): pool label or UUID
        cont (str): container name or UUID
        location (str): Fault injection location
        sys_name (str, optional): DAOS system name. Defaults to None.
        path (str, optional): unified namespace path. Defaults to None.
        rank (str, optional): Rank to inject fault on (default: 4294967295). Defaults to
            None.
        frequency (str, optional): Fault injection frequency (default: once). Defaults to
            None.

    Returns:
        dict: JSON output

    Raises:
        CommandFailure: if the command fails.

    """
    sub_command = ("faults", "container")
    command_args = {
        "pool": pool,
        "cont": cont,
        "location": location,
        "sys_name": sys_name,
        "path": path,
        "rank": rank,
        "frequency": frequency,
    }
    return self._get_json_result(sub_command, **command_args)

def filesystem_copy(self, src, dst, preserve_props=None):
"""Copy a POSIX container or path to another POSIX container or path.

Expand Down
Loading