DAOS-11736 test: CR Pass 2 - Dangling pool map test #12517

Merged 13 commits on Aug 2, 2023
91 changes: 90 additions & 1 deletion src/tests/ftest/recovery/pool_membership.py
@@ -182,7 +182,7 @@ def test_orphan_pool_shard(self):
        # i.e., Current status is COMPLETED.
        errors = []
        query_msg = ""
-       for _ in range(8):
+       for _ in range(20):
            check_query_out = dmg_command.check_query()
            if check_query_out["response"]["status"] == "COMPLETED":
                query_msg = check_query_out["response"]["reports"][0]["msg"]
@@ -211,3 +211,92 @@ def test_orphan_pool_shard(self):
            errors.append(msg)

        report_errors(test=self, errors=errors)

    def test_dangling_pool_map(self):
        """Test dangling pool map.

        1. Create a pool.
        2. Stop servers.
        3. Manually remove /<scm_mount>/<pool_uuid>/vos-0 from rank 0 node.
        4. Enable and start the checker.
        5. Query the checker and verify that the issue was fixed, i.e., the current status
           is COMPLETED.
        6. Disable the checker.
        7. Verify that the pool has one less target.

        Jira ID: DAOS-11736

        :avocado: tags=all,pr
        :avocado: tags=hw,medium
        :avocado: tags=recovery,pool_membership
        :avocado: tags=PoolMembershipTest,test_dangling_pool_map
        """
        # 1. Create a pool.
        self.log_step("Creating a pool (dmg pool create)")
        pool = self.get_pool(connect=False)

        # 2. Stop servers.
        dmg_command = self.get_dmg_command()
        dmg_command.system_stop()
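        # Stopping the engines lets the next step remove the pool's VOS file directly
        # from the SCM mount point on rank 0.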

        # 3. Manually remove /<scm_mount>/<pool_uuid>/vos-0 from rank 0 node.
        rank_0_host = NodeSet(self.server_managers[0].get_host(0))
        scm_mount = self.server_managers[0].get_config_value("scm_mount")
        rm_cmd = f"sudo rm {scm_mount}/{pool.uuid.lower()}/vos-0"
        if not run_remote(log=self.log, hosts=rank_0_host, command=rm_cmd).passed:
            self.fail(f"The following command failed on {rank_0_host}! {rm_cmd}")

        # 4. Enable and start the checker.
        self.log_step("Enable and start the checker.")
        dmg_command.check_enable(stop=False)
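        # stop=False: the system was already stopped in step 2, so check_enable should not
        # need to stop it again.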

        errors = []
        query_msg = ""

        # If we start the checker right after enabling it, the checker may not detect any
        # fault (this could not be reproduced manually). If that happens, stop and restart
        # the checker.
        repair_reports = None
        restart_count = 0
        while restart_count < 5:
            # Start the checker.
            dmg_command.check_start()

            # 5. Query the checker and verify that the issue was fixed.
            for _ in range(8):
                check_query_out = dmg_command.check_query()
                if check_query_out["response"]["status"] == "COMPLETED":
                    repair_reports = check_query_out["response"]["reports"]
                    break
                time.sleep(5)

            if repair_reports:
                break

            self.log.info("Checker didn't detect any fault. Restart %d", restart_count)
            dmg_command.check_stop()
            restart_count += 1

        if not repair_reports:
            self.fail("Checker didn't detect or repair any inconsistency!")

        query_msg = repair_reports[0]["msg"]
        if "dangling target" not in query_msg:
            errors.append(
                "Checker didn't fix the dangling pool map! msg = {}".format(query_msg))

        # 6. Disable the checker.
        self.log_step("Disable the checker.")
        dmg_command.check_disable()

        # 7. Verify that the pool has one less target.
        query_out = pool.query()
        total_targets = query_out["response"]["total_targets"]
        active_targets = query_out["response"]["active_targets"]
        expected_targets = total_targets - 1
        if active_targets != expected_targets:
            msg = (f"Unexpected number of active targets! Expected = {expected_targets}; "
                   f"Actual = {active_targets}")
            errors.append(msg)

        report_errors(test=self, errors=errors)
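
Both the updated test_orphan_pool_shard loop and the new test poll dmg check query until the checker status reaches COMPLETED. A minimal sketch of how that polling could be factored into a shared helper on the test class is shown below; the helper name and default arguments are illustrative only and are not part of this change.

    def _wait_for_check_completion(self, dmg_command, retries=20, delay=5):
        """Poll dmg check query until the checker status is COMPLETED.

        Returns the list of repair reports, or None if the checker does not
        reach COMPLETED within roughly retries * delay seconds.
        """
        for _ in range(retries):
            check_query_out = dmg_command.check_query()
            if check_query_out["response"]["status"] == "COMPLETED":
                return check_query_out["response"]["reports"]
            time.sleep(delay)
        return None

Each call site would then reduce to a single call such as repair_reports = self._wait_for_check_completion(dmg_command), keeping the retry policy in one place.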