Skip to content

Commit

Permalink
DAOS-11736 test: CR Pass 2 - Dangling pool map test (#12517)
Browse files Browse the repository at this point in the history
1. Create a pool.
2. Stop servers.
3. Manually remove /mnt/daos0/<pool_uuid>/vos-0 from rank 0 node.
4. Enable and start the checker.
5. Query the checker and verify that the issue was fixed, i.e., the current status is COMPLETED.
6. Disable the checker.
7. Verify that the pool has one less target.

Signed-off-by: Makito Kano <[email protected]>
  • Loading branch information
shimizukko authored Aug 2, 2023
1 parent 02a43c1 commit 448373a
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 0 deletions.
84 changes: 84 additions & 0 deletions src/tests/ftest/recovery/pool_membership.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,11 @@ def test_orphan_pool_shard(self):
# 6. Enable and start the checker.
self.log_step("Enable and start the checker.")
dmg_command.check_enable()

# If we call check start immediately after check enable, checker may not detect
# the fault. Developer is fixing this issue.
time.sleep(3)

dmg_command.check_start()

# 7. Query the checker and verify that the issue was fixed.
Expand Down Expand Up @@ -211,3 +216,82 @@ def test_orphan_pool_shard(self):
errors.append(msg)

report_errors(test=self, errors=errors)

def test_dangling_pool_map(self):
    """Test dangling pool map.

    1. Create a pool.
    2. Stop servers.
    3. Manually remove /<scm_mount>/<pool_uuid>/vos-0 from rank 0 node.
    4. Enable and start the checker.
    5. Query the checker and verify that the issue was fixed. i.e., Current status is
    COMPLETED.
    6. Disable the checker.
    7. Verify that the pool has one less target.

    Jira ID: DAOS-11736

    :avocado: tags=all,pr
    :avocado: tags=hw,medium
    :avocado: tags=recovery,pool_membership
    :avocado: tags=PoolMembershipTest,test_dangling_pool_map
    """
    # 1. Create a pool.
    self.log_step("Creating a pool (dmg pool create)")
    pool = self.get_pool(connect=False)

    # 2. Stop servers.
    self.log_step("Stop servers.")
    dmg_command = self.get_dmg_command()
    dmg_command.system_stop()

    # 3. Manually remove /<scm_mount>/<pool_uuid>/vos-0 from rank 0 node.
    self.log_step("Manually remove /<scm_mount>/<pool_uuid>/vos-0 from rank 0 node.")
    rank_0_host = NodeSet(self.server_managers[0].get_host(0))
    scm_mount = self.server_managers[0].get_config_value("scm_mount")
    rm_cmd = f"sudo rm {scm_mount}/{pool.uuid.lower()}/vos-0"
    if not run_remote(log=self.log, hosts=rank_0_host, command=rm_cmd).passed:
        self.fail(f"Following command failed on {rank_0_host}! {rm_cmd}")

    # 4. Enable and start the checker.
    self.log_step("Enable and start the checker.")
    # NOTE(review): presumably stop=False skips stopping the system because the
    # servers were already stopped in step 2 - confirm against check_enable().
    dmg_command.check_enable(stop=False)

    # If we call check start immediately after check enable, checker may not detect
    # the fault. Developer is fixing this issue.
    time.sleep(3)

    # Start checker.
    dmg_command.check_start()

    # 5. Query the checker and verify that the issue was fixed.
    self.log_step("Query the checker and verify that the issue was fixed.")
    # Poll up to 8 times, 5 seconds apart, until the checker reports COMPLETED.
    repair_reports = None
    for _ in range(8):
        check_query_out = dmg_command.check_query()
        if check_query_out["response"]["status"] == "COMPLETED":
            repair_reports = check_query_out["response"]["reports"]
            break
        time.sleep(5)

    # Covers both "never reached COMPLETED" and "COMPLETED with no reports".
    if not repair_reports:
        self.fail("Checker didn't detect or repair any inconsistency!")

    errors = []
    query_msg = repair_reports[0]["msg"]
    if "dangling target" not in query_msg:
        errors.append(f"Checker didn't fix dangling pool map! msg = {query_msg}")

    # 6. Disable the checker.
    self.log_step("Disable the checker.")
    dmg_command.check_disable()

    # 7. Verify that the pool has one less target.
    self.log_step("Verify that the pool has one less target.")
    query_out = pool.query()
    total_targets = query_out["response"]["total_targets"]
    active_targets = query_out["response"]["active_targets"]
    expected_targets = total_targets - 1
    if active_targets != expected_targets:
        msg = (f"Unexpected number of active targets! Expected = {expected_targets}; "
               f"Actual = {active_targets}")
        errors.append(msg)

    report_errors(test=self, errors=errors)
12 changes: 12 additions & 0 deletions src/tests/ftest/util/dmg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1422,6 +1422,18 @@ def check_start(self, pool=None, dry_run=False, reset=False, failout=None, auto=
("check", "start"), pool=pool, dry_run=dry_run, reset=reset, failout=failout,
auto=auto, find_orphans=find_orphans, policies=policies)

def check_stop(self, pool=None):
    """Run the ``dmg check stop`` command.

    Args:
        pool (str, optional): Pool label or UUID forwarded to the command.
            Defaults to None.

    Returns:
        dict: the dmg json command output converted to a python dictionary

    """
    sub_command = ("check", "stop")
    return self._get_json_result(sub_command, pool=pool)

def check_query(self, pool=None):
"""Call dmg check query.
Expand Down

0 comments on commit 448373a

Please sign in to comment.