Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/include/daos_srv/rebuild.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ typedef enum {
RB_OP_NONE = 0xffff,
} daos_rebuild_opc_t;

/**
* Bitmask values representing the pool operation(s) that caused a rebuild to
* be triggered. Multiple causes can be OR-ed together when tasks are merged.
*/
#define RB_CAUSE_EXCLUDE (1U << 0)
#define RB_CAUSE_DRAIN (1U << 1)
#define RB_CAUSE_REINT (1U << 2)
#define RB_CAUSE_EXTEND (1U << 3)

/** Only an exclude-triggered rebuild does NOT require a follow-up RECLAIM. */
#define RB_CAUSE_NO_RECLAIM (RB_CAUSE_EXCLUDE)

#define RB_OP_STR(rb_op) ((rb_op) == RB_OP_REBUILD ? "Rebuild" : \
(rb_op) == RB_OP_RECLAIM ? "Reclaim" : \
(rb_op) == RB_OP_FAIL_RECLAIM ? "Reclaim fail" : \
Expand Down Expand Up @@ -87,8 +99,9 @@ typedef enum {
int
ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t stable_eph,
uint32_t layout_version, struct pool_target_id_list *tgts,
daos_rebuild_opc_t rebuild_op, daos_rebuild_opc_t retry_rebuild_op,
uint32_t retry_map_ver, bool stop_admin, void *cur_taskp, uint64_t delay_sec);
daos_rebuild_opc_t rebuild_op, uint32_t rebuild_cause,
daos_rebuild_opc_t retry_rebuild_op, uint32_t retry_map_ver, bool stop_admin,
void *cur_taskp, uint64_t delay_sec);
void ds_rebuild_restart_if_rank_wip(uuid_t pool_uuid, d_rank_t rank);
int ds_rebuild_query(uuid_t pool_uuid,
struct daos_rebuild_status *status);
Expand Down
25 changes: 22 additions & 3 deletions src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -6484,6 +6484,7 @@ pool_check_upgrade_object_layout(struct rdb_tx *tx, struct pool_svc *svc,
if (current_layout_ver < DAOS_POOL_OBJ_VERSION) {
rc = ds_rebuild_schedule(svc->ps_pool, svc->ps_pool->sp_map_version, upgrade_eph,
DAOS_POOL_OBJ_VERSION, NULL, RB_OP_UPGRADE,
0 /* cause: N/A for upgrade */,
RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */,
false /* stop_admin */, NULL /* cur_taskp */, 0);
if (rc == 0)
Expand Down Expand Up @@ -7900,10 +7901,28 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank,
tgt_map_ver);

if (tgt_map_ver != 0) {
uint32_t rebuild_cause;

/*
* Translate the pool-map operation code into the rebuild-cause
* bitmask so that the rebuild machinery can decide (among other
* things) whether a follow-up RECLAIM is necessary.
*/
if (opc == MAP_EXCLUDE)
rebuild_cause = RB_CAUSE_EXCLUDE;
else if (opc == MAP_DRAIN)
rebuild_cause = RB_CAUSE_DRAIN;
else if (opc == MAP_REINT || opc == MAP_ADD_IN)
rebuild_cause = RB_CAUSE_REINT;
else if (opc == MAP_EXTEND)
rebuild_cause = RB_CAUSE_EXTEND;
else
rebuild_cause = 0; /* unknown – be conservative */

rc = ds_rebuild_schedule(svc->ps_pool, tgt_map_ver, rebuild_eph, 0, &target_list,
RB_OP_REBUILD, RB_OP_NONE /* retry_rebuild_op */,
0 /* retry_map_ver */, false /* stop_admin */,
NULL /* cur_taskp */, delay);
RB_OP_REBUILD, rebuild_cause,
RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */,
false /* stop_admin */, NULL /* cur_taskp */, delay);
if (rc != 0) {
D_ERROR("rebuild fails rc: "DF_RC"\n", DP_RC(rc));
D_GOTO(out, rc);
Expand Down
10 changes: 9 additions & 1 deletion src/rebuild/rebuild_internal.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2017-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -239,6 +239,14 @@ struct rebuild_task {
struct pool_target_id_list dst_tgts;
daos_rebuild_opc_t dst_rebuild_op;

/**
* Bitmask of RB_CAUSE_* flags describing what pool operation(s) caused
* this rebuild task to be scheduled. When tasks are merged the causes
* are OR-ed together. A value of 0 means "unknown" and is treated
* conservatively (RECLAIM will be scheduled).
*/
uint32_t dst_rebuild_cause;

/* Epoch to use for reclaim job for discarding the data
* of half-rebuild/reintegrated job.
*/
Expand Down
147 changes: 95 additions & 52 deletions src/rebuild/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1611,9 +1611,8 @@
* Other return value indicates an error.
*/
static int
rebuild_try_merge_tgts(struct ds_pool *pool, uint32_t map_ver,
daos_rebuild_opc_t rebuild_op,
struct pool_target_id_list *tgts, uint64_t delay_sec)
rebuild_try_merge_tgts(struct ds_pool *pool, uint32_t map_ver, daos_rebuild_opc_t rebuild_op,
uint32_t rebuild_cause, struct pool_target_id_list *tgts, uint64_t delay_sec)
{
struct rebuild_task *task;
struct rebuild_task *merge_pre_task = NULL;
Expand Down Expand Up @@ -1694,13 +1693,20 @@
merge_task->dst_map_ver = map_ver;
}

/*
* OR the causes together. If any merged task is triggered by an
* operation that requires RECLAIM (e.g. reintegrate, extend), the
* combined task will still schedule RECLAIM on completion.
*/
merge_task->dst_rebuild_cause |= rebuild_cause;

merge_task->dst_schedule_time = max(merge_task->dst_schedule_time,
daos_gettime_coarse() + delay_sec);
merge_task->dst_reclaim_ver = rebuild_task_get_min_version(pool->sp_map, tgts);
D_PRINT("%s [%s] ("DF_UUID" ver=%u/%u) id %u\n",
RB_OP_STR(rebuild_op), merge_task->dst_schedule_time == -1 ?
"queued/delayed" : "queued", DP_UUID(pool->sp_uuid), map_ver,
merge_task->dst_reclaim_ver, tgts->pti_ids[0].pti_id);
D_PRINT("%s [%s] (" DF_UUID " ver=%u/%u) id %u cause=0x%x\n", RB_OP_STR(rebuild_op),
merge_task->dst_schedule_time == -1 ? "queued/delayed" : "queued",
DP_UUID(pool->sp_uuid), map_ver, merge_task->dst_reclaim_ver,
tgts->pti_ids[0].pti_id, merge_task->dst_rebuild_cause);

/* Print out the current queue to the debug log */
rebuild_debug_print_queue();
Expand Down Expand Up @@ -1863,10 +1869,11 @@
return 0;
}

rc = ds_rebuild_schedule(
pool, task->dst_map_ver, task->dst_reclaim_eph, task->dst_new_layout_version,
&task->dst_tgts, task->dst_rebuild_op, task->dst_retry_rebuild_op,
task->dst_retry_map_ver, task->dst_stop_admin, NULL /* cur_taskp */, delay_sec);
rc = ds_rebuild_schedule(pool, task->dst_map_ver, task->dst_reclaim_eph,
task->dst_new_layout_version, &task->dst_tgts,
task->dst_rebuild_op, task->dst_rebuild_cause,
task->dst_retry_rebuild_op, task->dst_retry_map_ver,
task->dst_stop_admin, NULL /* cur_taskp */, delay_sec);
DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_UUID ": reschedule not-started %u(%s)",
DP_UUID(task->dst_pool_uuid), task->dst_rebuild_op,
RB_OP_STR(task->dst_rebuild_op));
Expand Down Expand Up @@ -1896,11 +1903,11 @@
/* reclaim or fail_reclaim failed - retry */
if (task->dst_rebuild_op == RB_OP_RECLAIM ||
task->dst_rebuild_op == RB_OP_FAIL_RECLAIM) {
rc = ds_rebuild_schedule(pool, task->dst_map_ver, rgt->rgt_stable_epoch,
task->dst_new_layout_version, &task->dst_tgts,
task->dst_rebuild_op, task->dst_retry_rebuild_op,
task->dst_retry_map_ver, task->dst_stop_admin,
task, delay_sec);
rc = ds_rebuild_schedule(
pool, task->dst_map_ver, rgt->rgt_stable_epoch,
task->dst_new_layout_version, &task->dst_tgts, task->dst_rebuild_op,
task->dst_rebuild_cause, task->dst_retry_rebuild_op,
task->dst_retry_map_ver, task->dst_stop_admin, task, delay_sec);
DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc,
DF_RB ": errno " DF_RC ", schedule retry %u(%s)", DP_RB_RGT(rgt),
DP_RC(rgt->rgt_status.rs_errno), task->dst_rebuild_op,
Expand All @@ -1920,7 +1927,7 @@
rc = ds_rebuild_schedule(
pool, task->dst_reclaim_ver - 1 /* map_ver */,
rgt->rgt_stable_epoch, task->dst_new_layout_version,
&task->dst_tgts, RB_OP_FAIL_RECLAIM,
&task->dst_tgts, RB_OP_FAIL_RECLAIM, task->dst_rebuild_cause,
task->dst_rebuild_op /* retry_rebuild_op */,
task->dst_map_ver /* retry_map_ver */, rgt->rgt_stop_admin,
task, delay_sec);
Expand All @@ -1938,8 +1945,9 @@
rc = ds_rebuild_schedule(
pool, task->dst_reclaim_ver - 1 /* map_ver */, rgt->rgt_stable_epoch,
task->dst_new_layout_version, &task->dst_tgts, RB_OP_FAIL_RECLAIM,
retry_opc /* retry_rebuild_op */, task->dst_map_ver /* retry_map_ver */,
rgt->rgt_stop_admin, task, delay_sec);
task->dst_rebuild_cause, retry_opc /* retry_rebuild_op */,
task->dst_map_ver /* retry_map_ver */, rgt->rgt_stop_admin, task,
delay_sec);
DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc,
DF_RB ": errno " DF_RC ", schedule %u(%s)", DP_RB_RGT(rgt),
DP_RC(rgt->rgt_status.rs_errno), RB_OP_FAIL_RECLAIM,
Expand All @@ -1955,27 +1963,50 @@
if (retry_opc == RB_OP_NONE)
D_GOTO(complete, rc);

rc = ds_rebuild_schedule(
pool, task->dst_map_ver, rgt->rgt_stable_epoch, task->dst_new_layout_version,
&task->dst_tgts, retry_opc /* rebuild_op*/, 0 /* retry_rebuild_op */,
0 /* retry_map_ver */, false /* stop_admin */, task, delay_sec);
rc = ds_rebuild_schedule(pool, task->dst_map_ver, rgt->rgt_stable_epoch,
task->dst_new_layout_version, &task->dst_tgts,
retry_opc /* rebuild_op*/, task->dst_rebuild_cause,
0 /* retry_rebuild_op */, 0 /* retry_map_ver */,
false /* stop_admin */, task, delay_sec);
DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc,
DF_RB ": errno " DF_RC ", schedule retry of original %u(%s)",
DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), retry_opc,
RB_OP_STR(retry_opc));
} else if (task->dst_rebuild_op == RB_OP_REBUILD || task->dst_rebuild_op == RB_OP_UPGRADE) {
/* Schedule reclaim for successful op:Rebuild
* (exclude/drain/reintegrate/extend/upgrade). */
rgt->rgt_status.rs_state = DRS_IN_PROGRESS;

obj_reclaim_ver = obj_reclaim_ver > 0 ? obj_reclaim_ver : task->dst_map_ver;
rc = ds_rebuild_schedule(
pool, obj_reclaim_ver, rgt->rgt_reclaim_epoch, task->dst_new_layout_version,
&task->dst_tgts, RB_OP_RECLAIM, RB_OP_NONE /* retry_rebuild_op */,
0 /* retry_map_ver */, false /* stop_admin */, task, delay_sec);
DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, DF_RB ": errno " DF_RC ", schedule %u(%s)",
DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), RB_OP_RECLAIM,
RB_OP_STR(RB_OP_RECLAIM));
/*
* Schedule reclaim for a successful op:Rebuild
* (exclude/drain/reintegrate/extend/upgrade).
*
 * Optimization: if the rebuild was triggered solely by an

Check failure on line 1980 in src/rebuild/srv.c

View workflow job for this annotation

GitHub Actions / Codespell

Optimisation ==> Optimization
* exclude operation, the excluded target is gone permanently
* and its data has already been migrated; there are no stale
* copies left to reclaim. Skip scheduling RB_OP_RECLAIM.
*
* All other causes (drain, reintegrate, extend) still require
* a follow-up RECLAIM. If dst_rebuild_cause is 0 (unknown,
* e.g. regenerated after a leader switch) we conservatively
* schedule RECLAIM.
*
* RB_OP_UPGRADE always needs RECLAIM regardless of cause.
*/
if (task->dst_rebuild_op == RB_OP_UPGRADE || task->dst_rebuild_cause == 0 ||
(task->dst_rebuild_cause & ~RB_CAUSE_NO_RECLAIM) != 0) {
rgt->rgt_status.rs_state = DRS_IN_PROGRESS;

obj_reclaim_ver = obj_reclaim_ver > 0 ? obj_reclaim_ver : task->dst_map_ver;
rc = ds_rebuild_schedule(
pool, obj_reclaim_ver, rgt->rgt_reclaim_epoch,
task->dst_new_layout_version, &task->dst_tgts, RB_OP_RECLAIM,
task->dst_rebuild_cause, RB_OP_NONE /* retry_rebuild_op */,
0 /* retry_map_ver */, false /* stop_admin */, task, delay_sec);
DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc,
DF_RB ": errno " DF_RC ", schedule %u(%s)", DP_RB_RGT(rgt),
DP_RC(rgt->rgt_status.rs_errno), RB_OP_RECLAIM,
RB_OP_STR(RB_OP_RECLAIM));
} else {
D_INFO(DF_RB ": skip RECLAIM (cause=0x%x, exclude only)\n", DP_RB_RGT(rgt),
task->dst_rebuild_cause);
}
}

complete:
Expand Down Expand Up @@ -2011,9 +2042,9 @@
rc = ds_rebuild_schedule(
pool, task->dst_retry_map_ver, rgt->rgt_reclaim_epoch,
task->dst_new_layout_version, &task->dst_tgts,
task->dst_retry_rebuild_op, RB_OP_NONE /* retry_rebuild_op */,
0 /* retry_map_ver */, false /* stop_admin */, task,
-1 /* delay_sec */);
task->dst_retry_rebuild_op, task->dst_rebuild_cause,
RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */,
false /* stop_admin */, task, -1 /* delay_sec */);
DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc,
DF_RB ": errno " DF_RC ", schedule retry %u(%s) with delay -1",
DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno),
Expand All @@ -2025,7 +2056,7 @@
/* Fail_reclaim done (and a stop command wasn't received during) - retry rebuild. */
rc1 = ds_rebuild_schedule(pool, task->dst_retry_map_ver, rgt->rgt_reclaim_epoch,
task->dst_new_layout_version, &task->dst_tgts,
task->dst_retry_rebuild_op,
task->dst_retry_rebuild_op, task->dst_rebuild_cause,
RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */,
false /* stop_admin */, task, delay_sec);
DL_CDEBUG(rc1, DLOG_ERR, DLOG_INFO, rc,
Expand Down Expand Up @@ -2465,8 +2496,9 @@
int
ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t reclaim_eph,
uint32_t layout_version, struct pool_target_id_list *tgts,
daos_rebuild_opc_t rebuild_op, daos_rebuild_opc_t retry_rebuild_op,
uint32_t retry_map_ver, bool stop_admin, void *cur_taskp, uint64_t delay_sec)
daos_rebuild_opc_t rebuild_op, uint32_t rebuild_cause,
daos_rebuild_opc_t retry_rebuild_op, uint32_t retry_map_ver, bool stop_admin,
void *cur_taskp, uint64_t delay_sec)
{
struct rebuild_task *cur_task = cur_taskp;
struct rebuild_task *new_task;
Expand All @@ -2492,7 +2524,8 @@
if (tgts != NULL && tgts->pti_number > 0 &&
rebuild_op != RB_OP_RECLAIM && rebuild_op != RB_OP_FAIL_RECLAIM) {
/* Check if the pool already in the queue list */
rc = rebuild_try_merge_tgts(pool, map_ver, rebuild_op, tgts, delay_sec);
rc = rebuild_try_merge_tgts(pool, map_ver, rebuild_op, rebuild_cause, tgts,
delay_sec);
if (rc)
return rc == 1 ? 0 : rc;
}
Expand All @@ -2514,6 +2547,16 @@
new_task->dst_rebuild_op = rebuild_op;
new_task->dst_reclaim_eph = reclaim_eph;
new_task->dst_new_layout_version = layout_version;
/*
 * Propagate or initialize the rebuild cause bitmask.

Check failure on line 2551 in src/rebuild/srv.c

View workflow job for this annotation

GitHub Actions / Codespell

initialise ==> initialize
* For derived operations (RECLAIM, FAIL_RECLAIM, retry of REBUILD) we
* carry the cause forward from the originating task so that the
 * RECLAIM-skip optimization can still be applied correctly.

Check failure on line 2554 in src/rebuild/srv.c

View workflow job for this annotation

GitHub Actions / Codespell

optimisation ==> optimization
*/
if (cur_task != NULL)
new_task->dst_rebuild_cause = cur_task->dst_rebuild_cause | rebuild_cause;
else
new_task->dst_rebuild_cause = rebuild_cause;
if (rebuild_op == RB_OP_FAIL_RECLAIM) {
new_task->dst_retry_map_ver = retry_map_ver;
new_task->dst_retry_rebuild_op = retry_rebuild_op;
Expand Down Expand Up @@ -2663,18 +2706,18 @@
id_list.pti_number = 1;

if (tgt->ta_comp.co_status & (PO_COMP_ST_DOWN | PO_COMP_ST_DRAIN)) {
rc = ds_rebuild_schedule(pool, tgt->ta_comp.co_fseq,
current_eph == 0 ? eph : current_eph, 0, &id_list,
RB_OP_REBUILD, RB_OP_NONE /* retry_rebuild_op */,
0 /* retry_map_ver */, false /* stop_admin */,
NULL /* cur_taskp */, delay);
rc = ds_rebuild_schedule(
pool, tgt->ta_comp.co_fseq, current_eph == 0 ? eph : current_eph, 0,
&id_list, RB_OP_REBUILD, 0 /* cause unknown after leader switch */,
RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */,
false /* stop_admin */, NULL /* cur_taskp */, delay);
} else {
D_ASSERT(tgt->ta_comp.co_status == PO_COMP_ST_UP);
rc = ds_rebuild_schedule(pool, tgt->ta_comp.co_in_ver,
current_eph == 0 ? eph : current_eph, 0, &id_list,
RB_OP_REBUILD, RB_OP_NONE /* retry_rebuild_op */,
0 /* retry_map_ver */, false /* stop_admin */,
NULL /* cur_taskp */, delay);
rc = ds_rebuild_schedule(
pool, tgt->ta_comp.co_in_ver, current_eph == 0 ? eph : current_eph, 0,
&id_list, RB_OP_REBUILD, 0 /* cause unknown after leader switch */,
RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */,
false /* stop_admin */, NULL /* cur_taskp */, delay);
}
if (rc) {
D_ERROR(DF_UUID" schedule ver %d failed: "DF_RC"\n",
Expand Down
Loading