Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions src/tests/ftest/recovery/check_start_corner_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,139 @@ def test_two_pools_healthy(self):
expected_props = {"label": container.label.value}
label_verified = container.verify_prop(expected_props=expected_props)
self.assertTrue(label_verified, "Container label isn't fixed!")

def test_two_pools_corrupted(self):
    """Test dmg check start with two pool labels that involve corrupted pools.

    1. Create three pools and containers.
    2. Inject container bad label fault into all.
    3. Enable checker. Set policy to --all-interactive.
    4. Call dmg check start pool_1 Invalid. Verify error message.
    5. Call dmg check start pool_1 pool_1. Repair and check that it's fixed.
    6. Call dmg check start pool_2 pool_3. Check that they're both fixed.
    7. Disable checker and verify that the three pools were actually fixed.

    Jira ID: DAOS-17859

    :avocado: tags=all,full_regression
    :avocado: tags=hw,medium
    :avocado: tags=recovery,cat_recov
    :avocado: tags=DMGCheckStartCornerCaseTest,test_two_pools_corrupted
    """
    # 1. Create three pools and containers.
    self.log_step("Create three pools and containers.")
    pool_1 = self.get_pool(connect=False)
    pool_2 = self.get_pool(connect=False)
    pool_3 = self.get_pool(connect=False)
    container_1 = self.get_container(pool=pool_1)
    container_2 = self.get_container(pool=pool_2)
    container_3 = self.get_container(pool=pool_3)
    pools = [pool_1, pool_2, pool_3]
    containers = [container_1, container_2, container_3]

    # 2. Inject container bad label fault into all.
    self.log_step("Inject container bad label fault into all.")
    daos_command = self.get_daos_command()
    for pool, container in zip(pools, containers):
        daos_command.faults_container(
            pool=pool.identifier, cont=container.identifier,
            location="DAOS_CHK_CONT_BAD_LABEL")

    # 3. Enable checker. Set policy to --all-interactive.
    self.log_step("Enable checker. Set policy to --all-interactive.")
    dmg_command = self.get_dmg_command()
    dmg_command.check_enable()
    dmg_command.check_set_policy(all_interactive=True)

    def _wait_for_reports(min_reports):
        """Poll dmg check query until at least min_reports reports are listed.

        Polls up to 8 times, 5 seconds apart.

        Args:
            min_reports (int): number of reports to wait for.

        Returns:
            list: the last "reports" value seen while the checker status was RUNNING, or
                None if the status was never RUNNING.
        """
        reports = None
        for _ in range(8):
            check_query_out = dmg_command.check_query()
            # Status becomes RUNNING immediately, but it may take a while to detect the
            # inconsistency. If detected, "reports" field is filled.
            if check_query_out["response"]["status"] == "RUNNING":
                reports = check_query_out["response"]["reports"]
                if reports and len(reports) >= min_reports:
                    break
            time.sleep(5)
        return reports

    # 4. Call dmg check start pool_1 Invalid. Verify error message.
    self.log_step("Call dmg check start pool_1 Invalid. Verify error message.")
    corrupted_invalid = f"{pool_1.identifier} TestPool_0"
    try:
        dmg_command.check_start(pool=corrupted_invalid)
    except CommandFailure as command_failure:
        exp_msg = "unable to find pool service"
        if exp_msg not in str(command_failure):
            self.fail(f"{exp_msg} is not in the error message!")
    else:
        # The command is expected to be rejected because one of the labels doesn't exist.
        self.fail("dmg check start with corrupted and invalid pool labels worked!")

    # 5. Call dmg check start pool_1 pool_1. Repair and check that it's fixed.
    self.log_step("Call dmg check start pool_1 pool_1.")
    corrupted_same = f"{pool_1.identifier} {pool_1.identifier}"
    try:
        dmg_command.check_start(pool=corrupted_same)
    except CommandFailure as command_failure:
        msg = f"dmg check start with two same corrupted pool labels failed! {command_failure}"
        self.fail(msg)
    else:
        self.log.info("dmg check start with two same corrupted pool labels worked as expected.")

    self.log_step("Wait for checker to detect inconsistent container label for pool_1 pool_1.")
    query_reports = _wait_for_reports(min_reports=1)
    if not query_reports:
        self.fail("Checker didn't detect any inconsistency!")
    fault_msg = query_reports[0]["msg"]
    expected_fault = "inconsistent container label"
    if expected_fault not in fault_msg:
        self.fail(f"Checker didn't detect {expected_fault}! Fault msg = {fault_msg}")

    self.log_step("Repair the fault for pool_1 pool_1.")
    # Obtain the seq num (ID) to repair.
    seq = query_reports[0]["seq"]
    # Repair with action 2, which is to use the original container label.
    dmg_command.check_repair(seq_num=str(seq), action="2")

    self.log_step("Check that the fault is fixed for pool_1 pool_1.")
    wait_for_check_complete(dmg=dmg_command)
    # Need to stop before starting again.
    dmg_command.check_stop()

    # 6. Call dmg check start pool_2 pool_3. Check that they're both fixed.
    self.log_step("Call dmg check start pool_2 pool_3.")
    corrupted_diff = f"{pool_2.identifier} {pool_3.identifier}"
    # Passing in two different valid labels is a normal use case, so no try-except.
    dmg_command.check_start(pool=corrupted_diff)

    self.log_step("Wait for checker to detect inconsistent container label for pool_2 pool_3.")
    # We have three corrupted pools, so wait for three reports. Stop polling as soon as at
    # least three are listed so that the wait condition agrees with the failure check below.
    query_reports = _wait_for_reports(min_reports=3)
    if not query_reports:
        self.fail("Checker didn't detect any inconsistency!")
    if len(query_reports) < 3:
        self.fail(f"Checker didn't detect 3 inconsistencies! {len(query_reports)}")
    # Obtain the seq nums (ID) to repair. Only the reports that belong to pool_2 and pool_3
    # are selected; the pool_1 report was already repaired in the previous step.
    target_labels = (pool_2.label.value, pool_3.label.value)
    seq_nums = [
        str(query_report["seq"]) for query_report in query_reports
        if query_report["pool_label"] in target_labels]

    self.log_step("Repair with option 2 for pool_2 pool_3.")
    for seq_num in seq_nums:
        dmg_command.check_repair(seq_num=seq_num, action="2")

    self.log_step("Check that the fault is fixed for pool_2 pool_3.")
    wait_for_check_complete(dmg=dmg_command)

    # 7. Disable checker and verify that the three pools were actually fixed.
    self.log_step("Disable checker and verify that the three pools were actually fixed.")
    dmg_command.check_disable()
    for container in containers:
        expected_props = {"label": container.label.value}
        label_verified = container.verify_prop(expected_props=expected_props)
        self.assertTrue(label_verified, f"{container.label.value} label isn't fixed!")
Loading