Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-11736 test: CR Pass 2 - Dangling pool map test #12517

Merged
merged 13 commits into from
Aug 2, 2023
Merged
71 changes: 70 additions & 1 deletion src/tests/ftest/recovery/pool_membership.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def test_orphan_pool_shard(self):
# i.e., Current status is COMPLETED.
errors = []
query_msg = ""
for _ in range(8):
for _ in range(20):
check_query_out = dmg_command.check_query()
if check_query_out["response"]["status"] == "COMPLETED":
query_msg = check_query_out["response"]["reports"][0]["msg"]
Expand Down Expand Up @@ -211,3 +211,72 @@ def test_orphan_pool_shard(self):
errors.append(msg)

report_errors(test=self, errors=errors)

def test_dangling_pool_map(self):
    """Test dangling pool map.

    1. Create a pool.
    2. Stop servers.
    3. Manually remove <scm_mount>/<pool_uuid>/vos-0 from rank 0 node.
    4. Enable and start the checker.
    5. Query the checker and verify that the issue was fixed. i.e., Current status is
    COMPLETED.
    6. Disable the checker.
    7. Verify that the pool has one less target.

    Jira ID: DAOS-11736

    :avocado: tags=all,pr
    :avocado: tags=hw,medium
    :avocado: tags=recovery,pool_membership
    :avocado: tags=PoolMembershipTest,test_dangling_pool_map
    """
    # 1. Create a pool.
    self.log_step("Creating a pool (dmg pool create)")
    pool = self.get_pool(connect=False)

    # 2. Stop servers.
    dmg_command = self.get_dmg_command()
    dmg_command.system_stop()

    # 3. Manually remove <scm_mount>/<pool_uuid>/vos-0 from rank 0 node.
    # The mount point is read from the server configuration instead of assuming
    # /mnt/daos0, so the test follows whatever the server config specifies.
    rank_0_host = NodeSet(self.server_managers[0].get_host(0))
    scm_mount = self.server_managers[0].get_config_value("scm_mount")
    rm_cmd = f"sudo rm {scm_mount}/{pool.uuid.lower()}/vos-0"
    if not run_remote(log=self.log, hosts=rank_0_host, command=rm_cmd).passed:
        self.fail(f"Following command failed on {rank_0_host}! {rm_cmd}")

    # 4. Enable and start the checker.
    self.log_step("Enable and start the checker.")
    dmg_command.check_enable(stop=False)
    dmg_command.check_start()

    # 5. Query the checker and verify that the issue was fixed.
    # Poll up to 20 times, sleeping 5 seconds between attempts, until the checker
    # reports COMPLETED. query_msg stays empty if that never happens.
    errors = []
    query_msg = ""
    for _ in range(20):
        response = dmg_command.check_query()["response"]
        if response["status"] == "COMPLETED":
            # A COMPLETED status without any report is recorded as a test error by
            # the check below rather than crashing on the index lookup.
            reports = response.get("reports") or []
            if reports:
                query_msg = reports[0]["msg"]
            break
        time.sleep(5)
    if "dangling target" not in query_msg:
        errors.append(f"Checker didn't fix dangling pool map! msg = {query_msg}")

    # 6. Disable the checker.
    self.log_step("Disable the checker.")
    dmg_command.check_disable()

    # 7. Verify that the pool has one less target.
    query_out = pool.query()
    self.log.debug("## query_out = %s", query_out)
    total_targets = query_out["response"]["total_targets"]
    active_targets = query_out["response"]["active_targets"]
    expected_targets = total_targets - 1
    if active_targets != expected_targets:
        msg = (f"Unexpected number of active targets! Expected = {expected_targets}; "
               f"Actual = {active_targets}")
        errors.append(msg)

    report_errors(test=self, errors=errors)