def test_two_pools_corrupted(self):
    """Test to pass in two pool labels where the pools are corrupted.

    1. Create three pools and containers.
    2. Inject container bad label fault into all.
    3. Enable checker. Set policy to --all-interactive.
    4. Call dmg check start pool_1 Invalid. Verify error message.
    5. Call dmg check start pool_1 pool_1. Repair and check that it's fixed.
    6. Call dmg check start pool_2 pool_3. Check that they're both fixed.
    7. Disable checker and verify that the three pools were actually fixed.

    Jira ID: DAOS-17859

    :avocado: tags=all,full_regression
    :avocado: tags=hw,medium
    :avocado: tags=recovery,cat_recov
    :avocado: tags=DMGCheckStartCornerCaseTest,test_two_pools_corrupted
    """
    def wait_for_reports(dmg, min_reports):
        """Poll dmg check query until enough reports are listed.

        Queries up to 8 times with a 5 second pause between attempts. Fails the test if
        no report was seen at all while the checker status was RUNNING.

        Args:
            dmg: dmg command object used to call check_query().
            min_reports (int): number of reports to wait for before returning early.

        Returns:
            list: the last non-empty "reports" value seen. It may hold fewer than
                min_reports entries if the polling attempts ran out, so callers that
                need an exact minimum must check the length.
        """
        reports = None
        for _ in range(8):
            response = dmg.check_query()["response"]
            # Status becomes RUNNING immediately, but it may take a while to detect the
            # inconsistency. If detected, "reports" field is filled.
            if response["status"] == "RUNNING":
                reports = response["reports"]
                if reports and len(reports) >= min_reports:
                    break
            time.sleep(5)
        if not reports:
            self.fail("Checker didn't detect any inconsistency!")
        return reports

    # 1. Create three pools and containers.
    self.log_step("Create three pools and containers.")
    pool_1 = self.get_pool(connect=False)
    pool_2 = self.get_pool(connect=False)
    pool_3 = self.get_pool(connect=False)
    pools = [pool_1, pool_2, pool_3]
    containers = [self.get_container(pool=pool) for pool in pools]

    # 2. Inject container bad label fault into all.
    self.log_step("Inject container bad label fault into all.")
    daos_command = self.get_daos_command()
    for pool, container in zip(pools, containers):
        daos_command.faults_container(
            pool=pool.identifier, cont=container.identifier,
            location="DAOS_CHK_CONT_BAD_LABEL")

    # 3. Enable checker. Set policy to --all-interactive.
    self.log_step("Enable checker. Set policy to --all-interactive.")
    dmg_command = self.get_dmg_command()
    dmg_command.check_enable()
    dmg_command.check_set_policy(all_interactive=True)

    # 4. Call dmg check start pool_1 Invalid. Verify error message.
    self.log_step("Call dmg check start pool_1 Invalid. Verify error message.")
    corrupted_invalid = pool_1.identifier + " TestPool_0"
    try:
        dmg_command.check_start(pool=corrupted_invalid)
    except CommandFailure as command_failure:
        exp_msg = "unable to find pool service"
        if exp_msg not in str(command_failure):
            self.fail(f"{exp_msg} is not in the error message!")
    else:
        # The command is expected to be rejected because of the invalid label.
        self.fail("dmg check start with corrupted and invalid pool labels worked!")

    # 5. Call dmg check start pool_1 pool_1. Repair and check that it's fixed.
    self.log_step("Call dmg check start pool_1 pool_1.")
    corrupted_same = pool_1.identifier + " " + pool_1.identifier
    try:
        dmg_command.check_start(pool=corrupted_same)
    except CommandFailure as command_failure:
        msg = f"dmg check start with two same corrupted pool labels failed! {command_failure}"
        self.fail(msg)
    self.log.info("dmg check start with two same corrupted pool labels worked as expected.")

    self.log_step("Wait for checker to detect inconsistent container label for pool_1 pool_1.")
    query_reports = wait_for_reports(dmg_command, min_reports=1)
    fault_msg = query_reports[0]["msg"]
    expected_fault = "inconsistent container label"
    if expected_fault not in fault_msg:
        self.fail(f"Checker didn't detect {expected_fault}! Fault msg = {fault_msg}")

    self.log_step("Repair the fault for pool_1 pool_1.")
    # Obtain the seq num (ID) to repair.
    seq = query_reports[0]["seq"]
    # Repair with action 2, which is to use the original container label.
    dmg_command.check_repair(seq_num=str(seq), action="2")

    self.log_step("Check that the fault is fixed for pool_1 pool_1.")
    wait_for_check_complete(dmg=dmg_command)
    # Need to stop before starting again.
    dmg_command.check_stop()

    # 6. Call dmg check start pool_2 pool_3. Check that they're both fixed.
    self.log_step("Call dmg check start pool_2 pool_3.")
    corrupted_diff = pool_2.identifier + " " + pool_3.identifier
    # Passing in two different valid labels is a normal use case, so no try-except.
    dmg_command.check_start(pool=corrupted_diff)

    self.log_step("Wait for checker to detect inconsistent container label for pool_2 pool_3.")
    # We have three corrupted pools, so wait for three reports.
    query_reports = wait_for_reports(dmg_command, min_reports=3)
    if len(query_reports) < 3:
        self.fail(f"Checker didn't detect 3 inconsistencies! {len(query_reports)}")
    # Obtain the seq nums (ID) to repair. Only reports for pool_2 and pool_3 are selected.
    target_labels = (pool_2.label.value, pool_3.label.value)
    seq_nums = [
        str(query_report["seq"]) for query_report in query_reports
        if query_report["pool_label"] in target_labels]
    if len(seq_nums) < len(target_labels):
        # Without a report per pool, a repair would be skipped and the wait for completion
        # below would fail with a less helpful message.
        self.fail(
            f"Expected reports for both pool_2 and pool_3, but found {len(seq_nums)}!")

    self.log_step("Repair with option 2 for pool_2 pool_3.")
    for seq_num in seq_nums:
        dmg_command.check_repair(seq_num=seq_num, action="2")

    self.log_step("Check that the fault is fixed for pool_2 pool_3.")
    wait_for_check_complete(dmg=dmg_command)

    # 7. Disable checker and verify that the three pools were actually fixed.
    self.log_step("Disable checker and verify that the three pools were actually fixed.")
    dmg_command.check_disable()
    for container in containers:
        expected_props = {"label": container.label.value}
        label_verified = container.verify_prop(expected_props=expected_props)
        self.assertTrue(label_verified, f"{container.label.value} label isn't fixed!")