diff --git a/src/include/daos_srv/rebuild.h b/src/include/daos_srv/rebuild.h index a161ba8d5ad..704fe2e1518 100644 --- a/src/include/daos_srv/rebuild.h +++ b/src/include/daos_srv/rebuild.h @@ -30,6 +30,18 @@ typedef enum { RB_OP_NONE = 0xffff, } daos_rebuild_opc_t; +/** + * Bitmask values representing the pool operation(s) that caused a rebuild to + * be triggered. Multiple causes can be OR-ed together when tasks are merged. + */ +#define RB_CAUSE_EXCLUDE (1U << 0) +#define RB_CAUSE_DRAIN (1U << 1) +#define RB_CAUSE_REINT (1U << 2) +#define RB_CAUSE_EXTEND (1U << 3) + +/** Only an exclude-triggered rebuild does NOT require a follow-up RECLAIM. */ +#define RB_CAUSE_NO_RECLAIM (RB_CAUSE_EXCLUDE) + #define RB_OP_STR(rb_op) ((rb_op) == RB_OP_REBUILD ? "Rebuild" : \ (rb_op) == RB_OP_RECLAIM ? "Reclaim" : \ (rb_op) == RB_OP_FAIL_RECLAIM ? "Reclaim fail" : \ @@ -87,8 +99,9 @@ typedef enum { int ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t stable_eph, uint32_t layout_version, struct pool_target_id_list *tgts, - daos_rebuild_opc_t rebuild_op, daos_rebuild_opc_t retry_rebuild_op, - uint32_t retry_map_ver, bool stop_admin, void *cur_taskp, uint64_t delay_sec); + daos_rebuild_opc_t rebuild_op, uint32_t rebuild_cause, + daos_rebuild_opc_t retry_rebuild_op, uint32_t retry_map_ver, bool stop_admin, + void *cur_taskp, uint64_t delay_sec); void ds_rebuild_restart_if_rank_wip(uuid_t pool_uuid, d_rank_t rank); int ds_rebuild_query(uuid_t pool_uuid, struct daos_rebuild_status *status); diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index c2a59cec0a9..6df05179fc0 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -6484,6 +6484,7 @@ pool_check_upgrade_object_layout(struct rdb_tx *tx, struct pool_svc *svc, if (current_layout_ver < DAOS_POOL_OBJ_VERSION) { rc = ds_rebuild_schedule(svc->ps_pool, svc->ps_pool->sp_map_version, upgrade_eph, DAOS_POOL_OBJ_VERSION, NULL, RB_OP_UPGRADE, + 0 /* cause: N/A for upgrade */, RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */, false /* stop_admin */, NULL /* cur_taskp */, 0); if (rc == 0) @@ -7900,10 +7901,28 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, tgt_map_ver); if (tgt_map_ver != 0) { + uint32_t rebuild_cause; + + /* + * Translate the pool-map operation code into the rebuild-cause + * bitmask so that the rebuild machinery can decide (among other + * things) whether a follow-up RECLAIM is necessary. + */ + if (opc == MAP_EXCLUDE) + rebuild_cause = RB_CAUSE_EXCLUDE; + else if (opc == MAP_DRAIN) + rebuild_cause = RB_CAUSE_DRAIN; + else if (opc == MAP_REINT || opc == MAP_ADD_IN) + rebuild_cause = RB_CAUSE_REINT; + else if (opc == MAP_EXTEND) + rebuild_cause = RB_CAUSE_EXTEND; + else + rebuild_cause = 0; /* unknown – be conservative */ + rc = ds_rebuild_schedule(svc->ps_pool, tgt_map_ver, rebuild_eph, 0, &target_list, - RB_OP_REBUILD, RB_OP_NONE /* retry_rebuild_op */, - 0 /* retry_map_ver */, false /* stop_admin */, - NULL /* cur_taskp */, delay); + RB_OP_REBUILD, rebuild_cause, + RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */, + false /* stop_admin */, NULL /* cur_taskp */, delay); if (rc != 0) { D_ERROR("rebuild fails rc: "DF_RC"\n", DP_RC(rc)); D_GOTO(out, rc); diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index c00b7d90517..b53767289df 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -239,6 +239,14 @@ struct rebuild_task { struct pool_target_id_list dst_tgts; daos_rebuild_opc_t dst_rebuild_op; + /** + * Bitmask of RB_CAUSE_* flags describing what pool operation(s) caused + * this rebuild task to be scheduled. When tasks are merged the causes + * are OR-ed together. A value of 0 means "unknown" and is treated + * conservatively (RECLAIM will be scheduled). + */ + uint32_t dst_rebuild_cause; + /* Epoch to use for reclaim job for discarding the data * of half-rebuild/reintegrated job. */ diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 91185bb5cc3..b58039f3393 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1611,9 +1611,8 @@ rebuild_task_get_min_version(struct pool_map *map, struct pool_target_id_list *t * Other return value indicates an error. */ static int -rebuild_try_merge_tgts(struct ds_pool *pool, uint32_t map_ver, - daos_rebuild_opc_t rebuild_op, - struct pool_target_id_list *tgts, uint64_t delay_sec) +rebuild_try_merge_tgts(struct ds_pool *pool, uint32_t map_ver, daos_rebuild_opc_t rebuild_op, + uint32_t rebuild_cause, struct pool_target_id_list *tgts, uint64_t delay_sec) { struct rebuild_task *task; struct rebuild_task *merge_pre_task = NULL; @@ -1694,13 +1693,20 @@ rebuild_try_merge_tgts(struct ds_pool *pool, uint32_t map_ver, merge_task->dst_map_ver = map_ver; } + /* + * OR the causes together. If any merged task is triggered by an + * operation that requires RECLAIM (e.g. reintegrate, extend), the + * combined task will still schedule RECLAIM on completion. + */ + merge_task->dst_rebuild_cause |= rebuild_cause; + merge_task->dst_schedule_time = max(merge_task->dst_schedule_time, daos_gettime_coarse() + delay_sec); merge_task->dst_reclaim_ver = rebuild_task_get_min_version(pool->sp_map, tgts); - D_PRINT("%s [%s] ("DF_UUID" ver=%u/%u) id %u\n", - RB_OP_STR(rebuild_op), merge_task->dst_schedule_time == -1 ? - "queued/delayed" : "queued", DP_UUID(pool->sp_uuid), map_ver, - merge_task->dst_reclaim_ver, tgts->pti_ids[0].pti_id); + D_PRINT("%s [%s] (" DF_UUID " ver=%u/%u) id %u cause=0x%x\n", RB_OP_STR(rebuild_op), + merge_task->dst_schedule_time == -1 ? "queued/delayed" : "queued", + DP_UUID(pool->sp_uuid), map_ver, merge_task->dst_reclaim_ver, + tgts->pti_ids[0].pti_id, merge_task->dst_rebuild_cause); /* Print out the current queue to the debug log */ rebuild_debug_print_queue(); @@ -1863,10 +1869,11 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, return 0; } - rc = ds_rebuild_schedule( - pool, task->dst_map_ver, task->dst_reclaim_eph, task->dst_new_layout_version, - &task->dst_tgts, task->dst_rebuild_op, task->dst_retry_rebuild_op, - task->dst_retry_map_ver, task->dst_stop_admin, NULL /* cur_taskp */, delay_sec); + rc = ds_rebuild_schedule(pool, task->dst_map_ver, task->dst_reclaim_eph, + task->dst_new_layout_version, &task->dst_tgts, + task->dst_rebuild_op, task->dst_rebuild_cause, + task->dst_retry_rebuild_op, task->dst_retry_map_ver, + task->dst_stop_admin, NULL /* cur_taskp */, delay_sec); DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_UUID ": reschedule not-started %u(%s)", DP_UUID(task->dst_pool_uuid), task->dst_rebuild_op, RB_OP_STR(task->dst_rebuild_op)); @@ -1896,11 +1903,11 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, /* reclaim or fail_reclaim failed - retry */ if (task->dst_rebuild_op == RB_OP_RECLAIM || task->dst_rebuild_op == RB_OP_FAIL_RECLAIM) { - rc = ds_rebuild_schedule(pool, task->dst_map_ver, rgt->rgt_stable_epoch, - task->dst_new_layout_version, &task->dst_tgts, - task->dst_rebuild_op, task->dst_retry_rebuild_op, - task->dst_retry_map_ver, task->dst_stop_admin, - task, delay_sec); + rc = ds_rebuild_schedule( + pool, task->dst_map_ver, rgt->rgt_stable_epoch, + task->dst_new_layout_version, &task->dst_tgts, task->dst_rebuild_op, + task->dst_rebuild_cause, task->dst_retry_rebuild_op, + task->dst_retry_map_ver, task->dst_stop_admin, task, delay_sec); DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_RB ": errno " DF_RC ", schedule retry %u(%s)", DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), task->dst_rebuild_op, @@ -1920,7 +1927,7 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, rc = ds_rebuild_schedule( pool, task->dst_reclaim_ver - 1 /* map_ver */, rgt->rgt_stable_epoch, task->dst_new_layout_version, - &task->dst_tgts, RB_OP_FAIL_RECLAIM, + &task->dst_tgts, RB_OP_FAIL_RECLAIM, task->dst_rebuild_cause, task->dst_rebuild_op /* retry_rebuild_op */, task->dst_map_ver /* retry_map_ver */, rgt->rgt_stop_admin, task, delay_sec); @@ -1938,8 +1945,9 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, rc = ds_rebuild_schedule( pool, task->dst_reclaim_ver - 1 /* map_ver */, rgt->rgt_stable_epoch, task->dst_new_layout_version, &task->dst_tgts, RB_OP_FAIL_RECLAIM, - retry_opc /* retry_rebuild_op */, task->dst_map_ver /* retry_map_ver */, - rgt->rgt_stop_admin, task, delay_sec); + task->dst_rebuild_cause, retry_opc /* retry_rebuild_op */, + task->dst_map_ver /* retry_map_ver */, rgt->rgt_stop_admin, task, + delay_sec); DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_RB ": errno " DF_RC ", schedule %u(%s)", DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), RB_OP_FAIL_RECLAIM, @@ -1955,27 +1963,50 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, if (retry_opc == RB_OP_NONE) D_GOTO(complete, rc); - rc = ds_rebuild_schedule( - pool, task->dst_map_ver, rgt->rgt_stable_epoch, task->dst_new_layout_version, - &task->dst_tgts, retry_opc /* rebuild_op*/, 0 /* retry_rebuild_op */, - 0 /* retry_map_ver */, false /* stop_admin */, task, delay_sec); + rc = ds_rebuild_schedule(pool, task->dst_map_ver, rgt->rgt_stable_epoch, + task->dst_new_layout_version, &task->dst_tgts, + retry_opc /* rebuild_op*/, task->dst_rebuild_cause, + 0 /* retry_rebuild_op */, 0 /* retry_map_ver */, + false /* stop_admin */, task, delay_sec); DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_RB ": errno " DF_RC ", schedule retry of original %u(%s)", DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), retry_opc, RB_OP_STR(retry_opc)); } else if (task->dst_rebuild_op == RB_OP_REBUILD || task->dst_rebuild_op == RB_OP_UPGRADE) { - /* Schedule reclaim for successful op:Rebuild - * (exclude/drain/reintegrate/extend/upgrade). */ - rgt->rgt_status.rs_state = DRS_IN_PROGRESS; - - obj_reclaim_ver = obj_reclaim_ver > 0 ? obj_reclaim_ver : task->dst_map_ver; - rc = ds_rebuild_schedule( - pool, obj_reclaim_ver, rgt->rgt_reclaim_epoch, task->dst_new_layout_version, - &task->dst_tgts, RB_OP_RECLAIM, RB_OP_NONE /* retry_rebuild_op */, - 0 /* retry_map_ver */, false /* stop_admin */, task, delay_sec); - DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_RB ": errno " DF_RC ", schedule %u(%s)", - DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), RB_OP_RECLAIM, - RB_OP_STR(RB_OP_RECLAIM)); + /* + * Schedule reclaim for a successful op:Rebuild + * (exclude/drain/reintegrate/extend/upgrade). + * + * Optimisation: if the rebuild was triggered solely by an + * exclude operation, the excluded target is gone permanently + * and its data has already been migrated; there are no stale + * copies left to reclaim. Skip scheduling RB_OP_RECLAIM. + * + * All other causes (drain, reintegrate, extend) still require + * a follow-up RECLAIM. If dst_rebuild_cause is 0 (unknown, + * e.g. regenerated after a leader switch) we conservatively + * schedule RECLAIM. + * + * RB_OP_UPGRADE always needs RECLAIM regardless of cause. + */ + if (task->dst_rebuild_op == RB_OP_UPGRADE || task->dst_rebuild_cause == 0 || + (task->dst_rebuild_cause & ~RB_CAUSE_NO_RECLAIM) != 0) { + rgt->rgt_status.rs_state = DRS_IN_PROGRESS; + + obj_reclaim_ver = obj_reclaim_ver > 0 ? obj_reclaim_ver : task->dst_map_ver; + rc = ds_rebuild_schedule( + pool, obj_reclaim_ver, rgt->rgt_reclaim_epoch, + task->dst_new_layout_version, &task->dst_tgts, RB_OP_RECLAIM, + task->dst_rebuild_cause, RB_OP_NONE /* retry_rebuild_op */, + 0 /* retry_map_ver */, false /* stop_admin */, task, delay_sec); + DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, + DF_RB ": errno " DF_RC ", schedule %u(%s)", DP_RB_RGT(rgt), + DP_RC(rgt->rgt_status.rs_errno), RB_OP_RECLAIM, + RB_OP_STR(RB_OP_RECLAIM)); + } else { + D_INFO(DF_RB ": skip RECLAIM (cause=0x%x, exclude only)\n", DP_RB_RGT(rgt), + task->dst_rebuild_cause); + } } complete: @@ -2011,9 +2042,9 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, rc = ds_rebuild_schedule( pool, task->dst_retry_map_ver, rgt->rgt_reclaim_epoch, task->dst_new_layout_version, &task->dst_tgts, - task->dst_retry_rebuild_op, RB_OP_NONE /* retry_rebuild_op */, - 0 /* retry_map_ver */, false /* stop_admin */, task, - -1 /* delay_sec */); + task->dst_retry_rebuild_op, task->dst_rebuild_cause, + RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */, + false /* stop_admin */, task, -1 /* delay_sec */); DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_RB ": errno " DF_RC ", schedule retry %u(%s) with delay -1", DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), @@ -2025,7 +2056,7 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, /* Fail_reclaim done (and a stop command wasn't received during) - retry rebuild. */ rc1 = ds_rebuild_schedule(pool, task->dst_retry_map_ver, rgt->rgt_reclaim_epoch, task->dst_new_layout_version, &task->dst_tgts, - task->dst_retry_rebuild_op, + task->dst_retry_rebuild_op, task->dst_rebuild_cause, RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */, false /* stop_admin */, task, delay_sec); DL_CDEBUG(rc1, DLOG_ERR, DLOG_INFO, rc, @@ -2465,8 +2496,9 @@ rebuild_print_list_update(const uuid_t uuid, const uint32_t map_ver, int ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t reclaim_eph, uint32_t layout_version, struct pool_target_id_list *tgts, - daos_rebuild_opc_t rebuild_op, daos_rebuild_opc_t retry_rebuild_op, - uint32_t retry_map_ver, bool stop_admin, void *cur_taskp, uint64_t delay_sec) + daos_rebuild_opc_t rebuild_op, uint32_t rebuild_cause, + daos_rebuild_opc_t retry_rebuild_op, uint32_t retry_map_ver, bool stop_admin, + void *cur_taskp, uint64_t delay_sec) { struct rebuild_task *cur_task = cur_taskp; struct rebuild_task *new_task; @@ -2492,7 +2524,8 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t reclaim if (tgts != NULL && tgts->pti_number > 0 && rebuild_op != RB_OP_RECLAIM && rebuild_op != RB_OP_FAIL_RECLAIM) { /* Check if the pool already in the queue list */ - rc = rebuild_try_merge_tgts(pool, map_ver, rebuild_op, tgts, delay_sec); + rc = rebuild_try_merge_tgts(pool, map_ver, rebuild_op, rebuild_cause, tgts, + delay_sec); if (rc) return rc == 1 ? 0 : rc; } @@ -2514,6 +2547,16 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t reclaim new_task->dst_rebuild_op = rebuild_op; new_task->dst_reclaim_eph = reclaim_eph; new_task->dst_new_layout_version = layout_version; + /* + * Propagate or initialise the rebuild cause bitmask. + * For derived operations (RECLAIM, FAIL_RECLAIM, retry of REBUILD) we + * carry the cause forward from the originating task so that the + * RECLAIM-skip optimisation can still be applied correctly. + */ + if (cur_task != NULL) + new_task->dst_rebuild_cause = cur_task->dst_rebuild_cause | rebuild_cause; + else + new_task->dst_rebuild_cause = rebuild_cause; if (rebuild_op == RB_OP_FAIL_RECLAIM) { new_task->dst_retry_map_ver = retry_map_ver; new_task->dst_retry_rebuild_op = retry_rebuild_op; @@ -2663,18 +2706,18 @@ regenerate_task_internal(struct ds_pool *pool, struct pool_target *tgts, id_list.pti_number = 1; if (tgt->ta_comp.co_status & (PO_COMP_ST_DOWN | PO_COMP_ST_DRAIN)) { - rc = ds_rebuild_schedule(pool, tgt->ta_comp.co_fseq, - current_eph == 0 ? eph : current_eph, 0, &id_list, - RB_OP_REBUILD, RB_OP_NONE /* retry_rebuild_op */, - 0 /* retry_map_ver */, false /* stop_admin */, - NULL /* cur_taskp */, delay); + rc = ds_rebuild_schedule( + pool, tgt->ta_comp.co_fseq, current_eph == 0 ? eph : current_eph, 0, + &id_list, RB_OP_REBUILD, 0 /* cause unknown after leader switch */, + RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */, + false /* stop_admin */, NULL /* cur_taskp */, delay); } else { D_ASSERT(tgt->ta_comp.co_status == PO_COMP_ST_UP); - rc = ds_rebuild_schedule(pool, tgt->ta_comp.co_in_ver, - current_eph == 0 ? eph : current_eph, 0, &id_list, - RB_OP_REBUILD, RB_OP_NONE /* retry_rebuild_op */, - 0 /* retry_map_ver */, false /* stop_admin */, - NULL /* cur_taskp */, delay); + rc = ds_rebuild_schedule( + pool, tgt->ta_comp.co_in_ver, current_eph == 0 ? eph : current_eph, 0, + &id_list, RB_OP_REBUILD, 0 /* cause unknown after leader switch */, + RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */, + false /* stop_admin */, NULL /* cur_taskp */, delay); } if (rc) { D_ERROR(DF_UUID" schedule ver %d failed: "DF_RC"\n",