From d80d06eace970c2efe7a1ad545adbf344c2d77b8 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sun, 15 Mar 2026 07:03:40 +0000 Subject: [PATCH 1/4] DAOS-17859 test: recovery/check_start_corner_case.py - Two pools corrupted Test to pass in two pool labels where one is a corrupted pool. 1. Create three pools and containers. 2. Inject container bad label fault into all. 3. Enable checker. Set policy to --all-interactive. 4. Call dmg check start pool_1 Invalid. Verify error message. 5. Call dmg check start pool_1 pool_1. Repair and check that it's fixed. 6. Call dmg check start pool_2 pool_3. Check that they're both fixed. 7. Disable checker and verify that the three pools were actually fixed. Skip-unit-tests: true Skip-fault-injection-test: true Skip-func-hw-test-medium: false Test-tag: test_two_pools_corrupted Signed-off-by: Makito Kano --- .../ftest/recovery/check_start_corner_case.py | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/src/tests/ftest/recovery/check_start_corner_case.py b/src/tests/ftest/recovery/check_start_corner_case.py index 150e1a540e4..fec22402a88 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.py +++ b/src/tests/ftest/recovery/check_start_corner_case.py @@ -271,3 +271,140 @@ def test_two_pools_healthy(self): expected_props = {"label": container.label.value} label_verified = container.verify_prop(expected_props=expected_props) self.assertTrue(label_verified, "Container label isn't fixed!") + + def test_two_pools_corrupted(self): + """Test to pass in two pool labels where one is a corrupted pool. + + 1. Create three pools and containers. + 2. Inject container bad label fault into all. + 3. Enable checker. Set policy to --all-interactive. + 4. Call dmg check start pool_1 Invalid. Verify error message. + 5. Call dmg check start pool_1 pool_1. Repair and check that it's fixed. + 6. Call dmg check start pool_2 pool_3. Check that they're both fixed. + 7. 
Disable checker and verify that the three pools were actually fixed. + + Jira ID: DAOS-17859 + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=recovery,cat_recov + :avocado: tags=DMGCheckStartCornerCaseTest,test_two_pools_corrupted + """ + # 1. Create three pools and containers. + self.log_step("Create three pools and containers.") + pool_1 = self.get_pool(connect=False) + pool_2 = self.get_pool(connect=False) + pool_3 = self.get_pool(connect=False) + container_1 = self.get_container(pool=pool_1) + container_2 = self.get_container(pool=pool_2) + container_3 = self.get_container(pool=pool_3) + pools = [pool_1, pool_2, pool_3] + containers = [container_1, container_2, container_3] + + # 2. Inject container bad label fault into all. + self.log_step("Inject container bad label fault into all.") + daos_command = self.get_daos_command() + for i in range(len(pools)): + daos_command.faults_container( + pool=pools[i].identifier, cont=containers[i].identifier, + location="DAOS_CHK_CONT_BAD_LABEL") + + # 3. Enable checker. Set policy to --all-interactive. + self.log_step("Enable checker. Set policy to --all-interactive.") + dmg_command = self.get_dmg_command() + dmg_command.check_enable() + dmg_command.check_set_policy(all_interactive=True) + + # 4. Call dmg check start pool_1 Invalid. Verify error message. + self.log_step("Call dmg check start pool_1 Invalid. Verify error message.") + corrupted_invalid = pool_1.identifier + " TestPool_0" + try: + dmg_command.check_start(pool=corrupted_invalid) + self.fail("dmg check start with corrupted and invalid pool labels worked!") + except CommandFailure as command_failure: + exp_msg = "unable to find pool service" + if exp_msg not in str(command_failure): + self.fail(f"{exp_msg} is not in the error message!") + + # 5. Call dmg check start pool_1 pool_1. Repair and check that it's fixed. 
+ self.log_step("Call dmg check start pool_1 pool_1.") + corrupted_same = pool_1.identifier + " " + pool_1.identifier + try: + dmg_command.check_start(pool=corrupted_same) + self.log.info("dmg check start with two same corrupted pool labels worked as expected.") + except CommandFailure as command_failure: + msg = (f"dmg check start with two same corrupted pool labels failed! {command_failure}") + self.fail(msg) + + self.log_step("Wait for checker to detect inconsistent container label for pool_1 pool_1.") + query_reports = None + for _ in range(8): + check_query_out = dmg_command.check_query() + # Status becomes RUNNING immediately, but it may take a while to detect the + # inconsistency. If detected, "reports" field is filled. + if check_query_out["response"]["status"] == "RUNNING": + query_reports = check_query_out["response"]["reports"] + if query_reports: + break + time.sleep(5) + if not query_reports: + self.fail("Checker didn't detect any inconsistency!") + fault_msg = query_reports[0]["msg"] + expected_fault = "inconsistent container label" + if expected_fault not in fault_msg: + self.fail(f"Checker didn't detect {expected_fault}! Fault msg = {fault_msg}") + + self.log_step("Repair the fault for pool_1 pool_1.") + # Obtain the seq num (ID) to repair. + seq = query_reports[0]["seq"] + # Repair with action 2, which is to use the original container label. + dmg_command.check_repair(seq_num=str(seq), action="2") + + self.log_step("Check that the fault is fixed for pool_1 pool_1.") + wait_for_check_complete(dmg=dmg_command) + # Need to stop before starting again. + dmg_command.check_stop() + + # 6. Call dmg check start pool_2 pool_3. Check that they're both fixed. + self.log_step("Call dmg check start pool_2 pool_3.") + corrupted_diff = pool_2.identifier + " " + pool_3.identifier + # Passing in two different valid labels is a normal use case, so no try-except. 
+ dmg_command.check_start(pool=corrupted_diff) + + self.log_step("Wait for checker to detect inconsistent container label for pool_2 pool_3.") + query_reports = None + for _ in range(8): + check_query_out = dmg_command.check_query() + # Status becomes RUNNING immediately, but it may take a while to detect the + # inconsistency. If detected, "reports" field is filled. + if check_query_out["response"]["status"] == "RUNNING": + query_reports = check_query_out["response"]["reports"] + # We have three corrupted pools, so wait for three reports. + if query_reports and len(query_reports) == 3: + break + time.sleep(5) + if not query_reports: + self.fail("Checker didn't detect any inconsistency!") + if len(query_reports) < 3: + self.fail(f"Checker didn't detect 3 inconsistencies! {len(query_reports)}") + # Obtain the seq nums (ID) to repair. + seq_nums = [] + for query_report in query_reports: + if query_report["pool_label"] == pool_2.label.value or \ + query_report["pool_label"] == pool_3.label.value: + seq_nums.append(str(query_report["seq"])) + + self.log_step("Repair with option 2 for pool_2 pool_3.") + for seq_num in seq_nums: + dmg_command.check_repair(seq_num=seq_num, action="2") + + self.log_step("Check that the fault is fixed for pool_2 pool_3.") + wait_for_check_complete(dmg=dmg_command) + + # 7. Disable checker and verify that the three pools were actually fixed. 
+ self.log_step("Disable checker and verify that the three pools were actually fixed.") + dmg_command.check_disable() + for container in containers: + expected_props = {"label": container.label.value} + label_verified = container.verify_prop(expected_props=expected_props) + self.assertTrue(label_verified, f"{container.label.value} label isn't fixed!") From db7bc9b1ce2d1ce66d35e62e93e45ed60c8c729f Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sun, 15 Mar 2026 07:18:15 +0000 Subject: [PATCH 2/4] DAOS-17859 test: Fix pylint Skip-unit-tests: true Skip-fault-injection-test: true Skip-func-hw-test-medium: false Test-tag: test_two_pools_corrupted Signed-off-by: Makito Kano --- src/tests/ftest/recovery/check_start_corner_case.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/tests/ftest/recovery/check_start_corner_case.py b/src/tests/ftest/recovery/check_start_corner_case.py index fec22402a88..ebf71fa6b5c 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.py +++ b/src/tests/ftest/recovery/check_start_corner_case.py @@ -304,10 +304,10 @@ def test_two_pools_corrupted(self): # 2. Inject container bad label fault into all. self.log_step("Inject container bad label fault into all.") daos_command = self.get_daos_command() - for i in range(len(pools)): + for i, pool in enumerate(pools): daos_command.faults_container( - pool=pools[i].identifier, cont=containers[i].identifier, - location="DAOS_CHK_CONT_BAD_LABEL") + pool=pool.identifier, cont=containers[i].identifier, + location="DAOS_CHK_CONT_BAD_LABEL") # 3. Enable checker. Set policy to --all-interactive. self.log_step("Enable checker. Set policy to --all-interactive.") @@ -333,7 +333,7 @@ def test_two_pools_corrupted(self): dmg_command.check_start(pool=corrupted_same) self.log.info("dmg check start with two same corrupted pool labels worked as expected.") except CommandFailure as command_failure: - msg = (f"dmg check start with two same corrupted pool labels failed! 
{command_failure}") + msg = f"dmg check start with two same corrupted pool labels failed! {command_failure}" self.fail(msg) self.log_step("Wait for checker to detect inconsistent container label for pool_1 pool_1.") @@ -391,7 +391,7 @@ def test_two_pools_corrupted(self): seq_nums = [] for query_report in query_reports: if query_report["pool_label"] == pool_2.label.value or \ - query_report["pool_label"] == pool_3.label.value: + query_report["pool_label"] == pool_3.label.value: seq_nums.append(str(query_report["seq"])) self.log_step("Repair with option 2 for pool_2 pool_3.") From 8f3dadea3214922f8981f499dfb53950c37b454e Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sun, 15 Mar 2026 07:24:04 +0000 Subject: [PATCH 3/4] DAOS-17859 test: Fix pylint Skip-unit-tests: true Skip-fault-injection-test: true Skip-func-hw-test-medium: false Test-tag: test_two_pools_corrupted Signed-off-by: Makito Kano --- src/tests/ftest/recovery/check_start_corner_case.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/recovery/check_start_corner_case.py b/src/tests/ftest/recovery/check_start_corner_case.py index ebf71fa6b5c..df084f21c17 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.py +++ b/src/tests/ftest/recovery/check_start_corner_case.py @@ -306,8 +306,8 @@ def test_two_pools_corrupted(self): daos_command = self.get_daos_command() for i, pool in enumerate(pools): daos_command.faults_container( - pool=pool.identifier, cont=containers[i].identifier, - location="DAOS_CHK_CONT_BAD_LABEL") + pool=pool.identifier, cont=containers[i].identifier, + location="DAOS_CHK_CONT_BAD_LABEL") # 3. Enable checker. Set policy to --all-interactive. self.log_step("Enable checker. 
Set policy to --all-interactive.") @@ -391,8 +391,8 @@ def test_two_pools_corrupted(self): seq_nums = [] for query_report in query_reports: if query_report["pool_label"] == pool_2.label.value or \ - query_report["pool_label"] == pool_3.label.value: - seq_nums.append(str(query_report["seq"])) + query_report["pool_label"] == pool_3.label.value: + seq_nums.append(str(query_report["seq"])) self.log_step("Repair with option 2 for pool_2 pool_3.") for seq_num in seq_nums: From 01849b77b62bdb32a9b5330be85851ee5cf71893 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sun, 15 Mar 2026 07:30:36 +0000 Subject: [PATCH 4/4] DAOS-17859 test: Fix pylint consider-using-in Skip-unit-tests: true Skip-fault-injection-test: true Skip-func-hw-test-medium: false Test-tag: test_two_pools_corrupted Signed-off-by: Makito Kano --- src/tests/ftest/recovery/check_start_corner_case.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tests/ftest/recovery/check_start_corner_case.py b/src/tests/ftest/recovery/check_start_corner_case.py index df084f21c17..bbad0f4ccf3 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.py +++ b/src/tests/ftest/recovery/check_start_corner_case.py @@ -390,9 +390,8 @@ def test_two_pools_corrupted(self): # Obtain the seq nums (ID) to repair. seq_nums = [] for query_report in query_reports: - if query_report["pool_label"] == pool_2.label.value or \ - query_report["pool_label"] == pool_3.label.value: - seq_nums.append(str(query_report["seq"])) + if query_report["pool_label"] in (pool_2.label.value, pool_3.label.value): + seq_nums.append(str(query_report["seq"])) self.log_step("Repair with option 2 for pool_2 pool_3.") for seq_num in seq_nums: