From 7280719e52d5500d732a2bc8c607be726bc787dc Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sat, 24 Jun 2023 05:38:47 +0000 Subject: [PATCH 01/12] DAOS-11736 test: CR Pass 2 - Dangling pool map test 1. Create a pool. 2. Stop servers. 3. Manually remove /mnt/daos0/<pool_uuid>/vos-0 from rank 0 node. 4. Enable and start the checker. 5. Query the checker and verify that the issue was fixed. i.e., Current status is COMPLETED. 6. Disable the checker. 7. Verify that the pool has one less target. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_dangling_pool_map Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 69 +++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 3638ce54a79..b191636058f 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -211,3 +211,72 @@ def test_orphan_pool_shard(self): errors.append(msg) report_errors(test=self, errors=errors) + + def test_dangling_pool_map(self): + """Test dangling pool map. + + 1. Create a pool. + 2. Stop servers. + 3. Manually remove /mnt/daos0//vos-0 from rank 0 node. + 4. Enable and start the checker. + 5. Query the checker and verify that the issue was fixed. i.e., Current status is + COMPLETED. + 6. Disable the checker. + 7. Verify that the pool has one less target. + + Jira ID: DAOS-11736 + + :avocado: tags=all,pr + :avocado: tags=hw,medium + :avocado: tags=recovery,dangling_pool_map + :avocado: tags=PoolMembershipTest,test_dangling_pool_map + """ + # 1. Create a pool. + self.log_step("Creating a pool (dmg pool create)") + pool = self.get_pool(connect=False) + + # 2. Stop servers. + dmg_command = self.get_dmg_command() + dmg_command.system_stop() + + # 3. Manually remove /mnt/daos0//vos-0 from rank 0 node. 
+ rank_0_host = NodeSet(self.server_managers[0].get_host(0)) + rm_cmd = (f"sudo rm /mnt/daos0/{pool.uuid.lower()}/vos-0") + if not run_remote(log=self.log, hosts=rank_0_host, command=rm_cmd).passed: + self.fail(f"Following command failed on {rank_0_host}! {rm_cmd}") + + # 4. Enable and start the checker. + self.log_step("Enable and start the checker.") + dmg_command.check_enable(stop=False) + dmg_command.check_start() + + # 5. Query the checker and verify that the issue was fixed. + errors = [] + query_msg = "" + for _ in range(10): + check_query_out = dmg_command.check_query() + if check_query_out["response"]["status"] == "COMPLETED": + query_msg = check_query_out["response"]["reports"][0]["msg"] + break + time.sleep(5) + if "dangling target" not in query_msg: + errors.append( + "Checker didn't fix orphan pool shard! msg = {}".format(query_msg)) + + # 6. Disable the checker. + self.log_step("Disable and start the checker.") + dmg_command.check_disable() + + # 7. Verify that the pool has one less target. + query_out = pool.query() + self.log.debug("## query_out = %s", query_out) + total_targets = query_out["response"]["total_targets"] + active_targets = query_out["response"]["active_targets"] + diff = total_targets - active_targets + if diff != 1: + expected_targets = total_targets - 1 + msg = (f"Unexpected number of active targets! 
Expected = {expected_targets}; " + f"Actual = {active_targets}") + errors.append(msg) + + report_errors(test=self, errors=errors) From a54038b3e9e904c528fbfbcf65452e31d36a1dc1 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sat, 24 Jun 2023 05:45:04 +0000 Subject: [PATCH 02/12] DAOS-11736 test: Fix pylint Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_dangling_pool_map Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index b191636058f..783c7b2f5bb 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -241,7 +241,7 @@ def test_dangling_pool_map(self): # 3. Manually remove /mnt/daos0//vos-0 from rank 0 node. rank_0_host = NodeSet(self.server_managers[0].get_host(0)) - rm_cmd = (f"sudo rm /mnt/daos0/{pool.uuid.lower()}/vos-0") + rm_cmd = f"sudo rm /mnt/daos0/{pool.uuid.lower()}/vos-0" if not run_remote(log=self.log, hosts=rank_0_host, command=rm_cmd).passed: self.fail(f"Following command failed on {rank_0_host}! 
{rm_cmd}") From 1102fe0f7613e1fec5522512511adec0a9fb57bc Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sat, 24 Jun 2023 05:51:24 +0000 Subject: [PATCH 03/12] DAOS-11736 test: Move the test to hw,small Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_dangling_pool_map Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 783c7b2f5bb..12c42f73631 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -58,7 +58,7 @@ def test_orphan_pool_shard(self): Jira ID: DAOS-11734 :avocado: tags=all,pr - :avocado: tags=hw,medium + :avocado: tags=hw,small :avocado: tags=recovery,pool_membership :avocado: tags=PoolMembershipTest,test_orphan_pool_shard """ @@ -227,7 +227,7 @@ def test_dangling_pool_map(self): Jira ID: DAOS-11736 :avocado: tags=all,pr - :avocado: tags=hw,medium + :avocado: tags=hw,small :avocado: tags=recovery,dangling_pool_map :avocado: tags=PoolMembershipTest,test_dangling_pool_map """ From 38119a2f3edd870bfe582a561a8ef08bfa17dd45 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sat, 24 Jun 2023 05:52:47 +0000 Subject: [PATCH 04/12] DAOS-11736 test: Update tag Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 12c42f73631..375ccfb7467 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -228,7 +228,7 @@ def test_dangling_pool_map(self): :avocado: tags=all,pr :avocado: tags=hw,small - :avocado: tags=recovery,dangling_pool_map + :avocado: 
tags=recovery,pool_membership :avocado: tags=PoolMembershipTest,test_dangling_pool_map """ # 1. Create a pool. From 43ee84494ebc9f3bb2fb192b441259ca328b878a Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sat, 24 Jun 2023 07:32:01 +0000 Subject: [PATCH 05/12] DAOS-11736 test: Use hw,medium Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 375ccfb7467..272463ddd33 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -58,7 +58,7 @@ def test_orphan_pool_shard(self): Jira ID: DAOS-11734 :avocado: tags=all,pr - :avocado: tags=hw,small + :avocado: tags=hw,medium :avocado: tags=recovery,pool_membership :avocado: tags=PoolMembershipTest,test_orphan_pool_shard """ @@ -227,7 +227,7 @@ def test_dangling_pool_map(self): Jira ID: DAOS-11736 :avocado: tags=all,pr - :avocado: tags=hw,small + :avocado: tags=hw,medium :avocado: tags=recovery,pool_membership :avocado: tags=PoolMembershipTest,test_dangling_pool_map """ From 13dfb88e1c0af7fb0806c67b5ca066d36dafe18d Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Thu, 6 Jul 2023 05:01:28 +0000 Subject: [PATCH 06/12] DAOS-11736 test: Increase dmg check query retry count Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 272463ddd33..ccdd1260fc9 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -253,7 +253,7 @@ def test_dangling_pool_map(self): # 
5. Query the checker and verify that the issue was fixed. errors = [] query_msg = "" - for _ in range(10): + for _ in range(20): check_query_out = dmg_command.check_query() if check_query_out["response"]["status"] == "COMPLETED": query_msg = check_query_out["response"]["reports"][0]["msg"] From 3f41c84407a551d5b5abb6bd23df4b96c27cf3cf Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Thu, 6 Jul 2023 05:47:34 +0000 Subject: [PATCH 07/12] DAOS-11736 test: Increase dmg check query retry count for orphan pool shard Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index ccdd1260fc9..5c55842cad9 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -182,7 +182,7 @@ def test_orphan_pool_shard(self): # i.e., Current status is COMPLETED. errors = [] query_msg = "" - for _ in range(8): + for _ in range(20): check_query_out = dmg_command.check_query() if check_query_out["response"]["status"] == "COMPLETED": query_msg = check_query_out["response"]["reports"][0]["msg"] From 7ff8826e670c4110238431122ca866ad9977a1c5 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Fri, 14 Jul 2023 04:06:38 +0000 Subject: [PATCH 08/12] DAOS-11736 test: Use get_config_value to obtain mount point Add checker restart logic in case checker doesn't detect fault. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Test-repeat: 5 Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 48 +++++++++++++++------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 5c55842cad9..9d77b005b6b 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -217,7 +217,7 @@ def test_dangling_pool_map(self): 1. Create a pool. 2. Stop servers. - 3. Manually remove /mnt/daos0//vos-0 from rank 0 node. + 3. Manually remove ///vos-0 from rank 0 node. 4. Enable and start the checker. 5. Query the checker and verify that the issue was fixed. i.e., Current status is COMPLETED. @@ -239,26 +239,48 @@ def test_dangling_pool_map(self): dmg_command = self.get_dmg_command() dmg_command.system_stop() - # 3. Manually remove /mnt/daos0//vos-0 from rank 0 node. + # 3. Manually remove ///vos-0 from rank 0 node. rank_0_host = NodeSet(self.server_managers[0].get_host(0)) - rm_cmd = f"sudo rm /mnt/daos0/{pool.uuid.lower()}/vos-0" + scm_mount = self.server_managers[0].get_config_value("scm_mount") + rm_cmd = f"sudo rm {scm_mount}/{pool.uuid.lower()}/vos-0" if not run_remote(log=self.log, hosts=rank_0_host, command=rm_cmd).passed: self.fail(f"Following command failed on {rank_0_host}! {rm_cmd}") # 4. Enable and start the checker. self.log_step("Enable and start the checker.") dmg_command.check_enable(stop=False) - dmg_command.check_start() - # 5. Query the checker and verify that the issue was fixed. errors = [] query_msg = "" - for _ in range(20): - check_query_out = dmg_command.check_query() - if check_query_out["response"]["status"] == "COMPLETED": - query_msg = check_query_out["response"]["reports"][0]["msg"] + + # If we start the checker right after enabling it, the checker may not detect any + # fault (can't reproduce manually). 
If it happens, stop and restart the checker. + repair_reports = None + restart_count = 0 + while restart_count < 5: + # Start checker. + dmg_command.check_start() + + # 5. Query the checker and verify that the issue was fixed. + for _ in range(8): + check_query_out = dmg_command.check_query() + if check_query_out["response"]["status"] == "COMPLETED": + repair_reports = check_query_out["response"]["reports"] + # query_msg = check_query_out["response"]["reports"][0]["msg"] + break + time.sleep(5) + + if repair_reports: break - time.sleep(5) + + self.log.info("Checker didn't detect fault. Restart %d", restart_count) + dmg_command.check_stop() + restart_count += 1 + + if not repair_reports: + self.fail("Checker didn't detect or repair any inconsistency!") + + query_msg = repair_reports[0]["msg"] if "dangling target" not in query_msg: errors.append( "Checker didn't fix orphan pool shard! msg = {}".format(query_msg)) @@ -269,12 +291,10 @@ def test_dangling_pool_map(self): # 7. Verify that the pool has one less target. query_out = pool.query() - self.log.debug("## query_out = %s", query_out) total_targets = query_out["response"]["total_targets"] active_targets = query_out["response"]["active_targets"] - diff = total_targets - active_targets - if diff != 1: - expected_targets = total_targets - 1 + expected_targets = total_targets - 1 + if active_targets != expected_targets: msg = (f"Unexpected number of active targets! 
Expected = {expected_targets}; " f"Actual = {active_targets}") errors.append(msg) From b9b835e7c09f7e34fd4a78209add2ebf855529fc Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Fri, 14 Jul 2023 23:52:01 +0000 Subject: [PATCH 09/12] DAOS-11736 test: Add check_stop() Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Test-repeat: 5 Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/util/dmg_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index b1c1e988760..5f2b438d678 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -1423,6 +1423,18 @@ def check_start(self, pool=None, dry_run=False, reset=False, failout=None, auto= ("check", "start"), pool=pool, dry_run=dry_run, reset=reset, failout=failout, auto=auto, find_orphans=find_orphans, policies=policies) + def check_stop(self, pool=None): + """Call dmg check stop. + + Args: + pool (str): Pool label or UUID. Defaults to None. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("check", "stop"), pool=pool) + def check_query(self, pool=None): """Call dmg check query. 
From 36938aa15a0b5079e9d4d50ff3da963db1a5ceeb Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sun, 16 Jul 2023 00:07:28 +0000 Subject: [PATCH 10/12] DAOS-11736 test: Remove unnecessary comment Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Test-repeat: 5 Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 9d77b005b6b..a006e995c72 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -182,7 +182,7 @@ def test_orphan_pool_shard(self): # i.e., Current status is COMPLETED. errors = [] query_msg = "" - for _ in range(20): + for _ in range(8): check_query_out = dmg_command.check_query() if check_query_out["response"]["status"] == "COMPLETED": query_msg = check_query_out["response"]["reports"][0]["msg"] @@ -266,7 +266,6 @@ def test_dangling_pool_map(self): check_query_out = dmg_command.check_query() if check_query_out["response"]["status"] == "COMPLETED": repair_reports = check_query_out["response"]["reports"] - # query_msg = check_query_out["response"]["reports"][0]["msg"] break time.sleep(5) From 6e615e942f602491dbfc7788d4b5c082a8d9c1ec Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Mon, 24 Jul 2023 11:49:31 +0000 Subject: [PATCH 11/12] DAOS-11736 test: Sleep 3 sec between check enable and check start Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Test-repeat: 7 Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 43 ++++++++++----------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index a006e995c72..52037e47293 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ 
b/src/tests/ftest/recovery/pool_membership.py @@ -176,6 +176,11 @@ def test_orphan_pool_shard(self): # 6. Enable and start the checker. self.log_step("Enable and start the checker.") dmg_command.check_enable() + + # If we call check start immediately after check enable, checker may not detect + # the fault. Developer is fixing this issue. + time.sleep(3) + dmg_command.check_start() # 7. Query the checker and verify that the issue was fixed. @@ -250,35 +255,29 @@ def test_dangling_pool_map(self): self.log_step("Enable and start the checker.") dmg_command.check_enable(stop=False) - errors = [] - query_msg = "" + # If we call check start immediately after check enable, checker may not detect + # the fault. Developer is fixing this issue. + time.sleep(3) + + # Start checker. + dmg_command.check_start() - # If we start the checker right after enabling it, the checker may not detect any - # fault (can't reproduce manually). If it happens, stop and restart the checker. - repair_reports = None - restart_count = 0 - while restart_count < 5: - # Start checker. - dmg_command.check_start() - - # 5. Query the checker and verify that the issue was fixed. - for _ in range(8): - check_query_out = dmg_command.check_query() - if check_query_out["response"]["status"] == "COMPLETED": - repair_reports = check_query_out["response"]["reports"] - break - time.sleep(5) - - if repair_reports: + # 5. Query the checker and verify that the issue was fixed. + for _ in range(8): + check_query_out = dmg_command.check_query() + if check_query_out["response"]["status"] == "COMPLETED": + repair_reports = check_query_out["response"]["reports"] break + time.sleep(5) - self.log.info("Checker didn't detect fault. Restart %d", restart_count) - dmg_command.check_stop() - restart_count += 1 + self.log.info("Checker didn't detect fault. 
Restart %d", restart_count) + dmg_command.check_stop() + restart_count += 1 if not repair_reports: self.fail("Checker didn't detect or repair any inconsistency!") + errors = [] query_msg = repair_reports[0]["msg"] if "dangling target" not in query_msg: errors.append( From f7329dc60dd73a25a3dfbaa5644aae767e15ceb2 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Mon, 24 Jul 2023 13:33:07 +0000 Subject: [PATCH 12/12] DAOS-11736 test: Remove unused variable Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pool_membership Test-repeat: 7 Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/recovery/pool_membership.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 52037e47293..fbeda9712cb 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -263,6 +263,7 @@ def test_dangling_pool_map(self): dmg_command.check_start() # 5. Query the checker and verify that the issue was fixed. + repair_reports = None for _ in range(8): check_query_out = dmg_command.check_query() if check_query_out["response"]["status"] == "COMPLETED": @@ -270,10 +271,6 @@ def test_dangling_pool_map(self): break time.sleep(5) - self.log.info("Checker didn't detect fault. Restart %d", restart_count) - dmg_command.check_stop() - restart_count += 1 - if not repair_reports: self.fail("Checker didn't detect or repair any inconsistency!")