From abdc15ddbd99f331a7c15a218b918ea4b261ba45 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 14 Mar 2026 14:49:26 +0000 Subject: [PATCH 1/3] DAOS-18541 rebuild: reduce redundant migration OID RPCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix yield-count accounting in the scanner: rebuild_object() is a pure in-memory btree insert and does not need to contribute yield pressure. A send-side batching policy is also introduced: the send ULT defers flushing until at least REBUILD_SEND_BATCH_MIN OIDs are queued or REBUILD_SEND_BATCH_TIMEOUT_SEC seconds have elapsed, preventing a flood of small migrate RPCs when the scanner runs faster than the sender — particularly under reintegration workloads. Signed-off-by: Wang Shilong --- src/rebuild/rebuild_internal.h | 4 ++- src/rebuild/scan.c | 59 +++++++++++++++++++++++++++++++--- src/rebuild/srv.c | 1 + 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index c00b7d90517..52673ab54e2 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -282,6 +282,8 @@ struct rebuild_pool_tls { unsigned int rebuild_pool_ver; uint32_t rebuild_pool_gen; uint64_t rebuild_pool_leader_term; + uint64_t rebuild_send_wait_start; + uint64_t rebuild_pool_obj_send_pending; int rebuild_pool_status; unsigned int rebuild_pool_scanning:1, rebuild_pool_scan_done:1; diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 61f8d86680c..6183b138a62 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -30,8 +30,14 @@ #include "rebuild_internal.h" #define REBUILD_SEND_LIMIT 4096 +/* Minimum pending objects before the send ULT flushes a batch (25% of max). */ +#define REBUILD_SEND_BATCH_MIN (REBUILD_SEND_LIMIT / 4) +/* Maximum seconds to wait for a batch to fill before flushing anyway. */ +#define REBUILD_SEND_BATCH_TIMEOUT_SEC 2 + struct rebuild_send_arg { struct rebuild_tgt_pool_tracker *rpt; + struct rebuild_pool_tls *tls; daos_unit_oid_t *oids; daos_epoch_t *ephs; daos_epoch_t *punched_ephs; @@ -77,6 +83,10 @@ rebuild_obj_fill_buf(daos_handle_t ih, d_iov_t *key_iov, if (rc != 0) return rc; + /* This OID is now removed from the btree; account for it. */ + D_ASSERT(arg->tls->rebuild_pool_obj_send_pending > 0); + arg->tls->rebuild_pool_obj_send_pending--; + /* re-probe the dbtree after delete */ rc = dbtree_iter_probe(ih, BTR_PROBE_FIRST, DAOS_INTENT_MIGRATION, NULL, NULL); @@ -300,17 +310,56 @@ rebuild_objects_send_ult(void *data) arg.ephs = ephs; arg.punched_ephs = punched_ephs; arg.rpt = rpt; + arg.tls = tls; + + tls->rebuild_send_wait_start = daos_gettime_coarse(); + + /* + * Batch OIDs before sending migrate RPCs to avoid RPC fragmentation. 
+ * The scan ULT yields every ~SCAN_YIELD_CNT placement-cost units + * (1 per OID for small objects, potentially more than 128 per OID for EC_16P3GX, + * depending on cluster size), so without batching the send ULT would flush + * only ~128 OIDs per RPC instead of the REBUILD_SEND_LIMIT maximum. + * Hold the flush until REBUILD_SEND_BATCH_MIN OIDs are pending or + * REBUILD_SEND_BATCH_TIMEOUT_SEC seconds have elapsed; flush immediately + * when the scan is done. + */ while (!tls->rebuild_pool_scan_done || !dbtree_is_empty(tls->rebuild_tree_hdl)) { + bool scan_done; + bool tree_empty; + uint64_t now; + uint64_t elapsed; + if (rpt->rt_stable_epoch == 0) { dss_sleep(0); continue; } - if (dbtree_is_empty(tls->rebuild_tree_hdl)) { + tree_empty = dbtree_is_empty(tls->rebuild_tree_hdl); + scan_done = tls->rebuild_pool_scan_done; + + if (tree_empty) { + /* Reset wait clock and yield to let scan make progress. */ + tls->rebuild_send_wait_start = daos_gettime_coarse(); dss_sleep(0); continue; } + now = daos_gettime_coarse(); + elapsed = now - tls->rebuild_send_wait_start; + + if (!scan_done && tls->rebuild_pool_obj_send_pending < REBUILD_SEND_BATCH_MIN && + elapsed < REBUILD_SEND_BATCH_TIMEOUT_SEC) { + dss_sleep(10); + continue; + } + + D_DEBUG(DB_REBUILD, + DF_RB " send batch: pending %" PRIu64 " elapsed %" PRIu64 "s" + " scan_done %d\n", + DP_RB_RPT(rpt), tls->rebuild_pool_obj_send_pending, elapsed, + (int)scan_done); + /* walk through the rebuild tree and send the rebuild objects */ rc = dbtree_iterate(tls->rebuild_tree_hdl, DAOS_INTENT_MIGRATION, false, rebuild_cont_iter_cb, &arg); @@ -318,6 +367,8 @@ rebuild_objects_send_ult(void *data) DL_ERROR(rc, DF_RB " dbtree iterate failed", DP_RB_RPT(rpt)); break; } + + tls->rebuild_send_wait_start = now; dss_sleep(0); } @@ -389,6 +440,8 @@ rebuild_object_insert(struct rebuild_tgt_pool_tracker *rpt, uuid_t co_uuid, DP_UUID(co_uuid), DP_UOID(oid), tgt_id); rc = 0; } else { + if (rc == 0) + tls->rebuild_pool_obj_send_pending++; 
D_DEBUG(DB_REBUILD, "insert "DF_UOID"/"DF_UUID" tgt %u "DF_U64"/"DF_U64": " DF_RC"\n", DP_UOID(oid), DP_UUID(co_uuid), tgt_id, epoch, punched_epoch, DP_RC(rc)); @@ -779,8 +832,6 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, rc = rebuild_object(rpt, arg->co_uuid, oid, tgts[i], shards[i], myrank, ent); if (rc) D_GOTO(out, rc); - - arg->yield_cnt--; } out: @@ -793,7 +844,7 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, if (map != NULL) pl_map_decref(map); - if (--arg->yield_cnt <= 0) { + if (arg->yield_cnt <= 0) { D_DEBUG(DB_REBUILD, DF_RB " rebuild yield: %d\n", DP_RB_RPT(rpt), rc); arg->yield_cnt = SCAN_YIELD_CNT; if (rc == 0) diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 91185bb5cc3..5b1e43ca771 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -129,6 +129,7 @@ rebuild_pool_tls_create(struct rebuild_tgt_pool_tracker *rpt) rebuild_pool_tls->rebuild_pool_scanning = 1; rebuild_pool_tls->rebuild_pool_scan_done = 0; rebuild_pool_tls->rebuild_pool_obj_count = 0; + rebuild_pool_tls->rebuild_pool_obj_send_pending = 0; rebuild_pool_tls->rebuild_pool_reclaim_obj_count = 0; rebuild_pool_tls->rebuild_tree_hdl = DAOS_HDL_INVAL; /* Only 1 thread will access the list, no need lock */ From 63b8f93e6ff8ada5a1917f9f6ee5f0370a582858 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 16 Mar 2026 00:52:14 +0000 Subject: [PATCH 2/3] Address comments Signed-off-by: Wang Shilong --- src/rebuild/scan.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 6183b138a62..aad8fe78ae0 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -33,7 +33,7 @@ /* Minimum pending objects before the send ULT flushes a batch (25% of max). */ #define REBUILD_SEND_BATCH_MIN (REBUILD_SEND_LIMIT / 4) /* Maximum seconds to wait for a batch to fill before flushing anyway. 
*/ -#define REBUILD_SEND_BATCH_TIMEOUT_SEC 2 +#define REBUILD_SEND_BATCH_TIMEOUT_SEC 1 struct rebuild_send_arg { struct rebuild_tgt_pool_tracker *rpt; @@ -319,7 +319,7 @@ rebuild_objects_send_ult(void *data) * The scan ULT yields every ~SCAN_YIELD_CNT placement-cost units * (1 per OID for small objects, potentially more than 128 per OID for EC_16P3GX, * depending on cluster size), so without batching the send ULT would flush - * only ~128 OIDs per RPC instead of the REBUILD_SEND_LIMIT maximum. + * at most 64 OIDs per RPC instead of the REBUILD_SEND_LIMIT maximum. * Hold the flush until REBUILD_SEND_BATCH_MIN OIDs are pending or * REBUILD_SEND_BATCH_TIMEOUT_SEC seconds have elapsed; flush immediately * when the scan is done. @@ -337,17 +337,16 @@ rebuild_objects_send_ult(void *data) tree_empty = dbtree_is_empty(tls->rebuild_tree_hdl); scan_done = tls->rebuild_pool_scan_done; + now = daos_gettime_coarse(); if (tree_empty) { /* Reset wait clock and yield to let scan make progress. */ - tls->rebuild_send_wait_start = daos_gettime_coarse(); - dss_sleep(0); + tls->rebuild_send_wait_start = now; + dss_sleep(10); continue; } - now = daos_gettime_coarse(); elapsed = now - tls->rebuild_send_wait_start; - if (!scan_done && tls->rebuild_pool_obj_send_pending < REBUILD_SEND_BATCH_MIN && elapsed < REBUILD_SEND_BATCH_TIMEOUT_SEC) { dss_sleep(10); @@ -832,6 +831,8 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, rc = rebuild_object(rpt, arg->co_uuid, oid, tgts[i], shards[i], myrank, ent); if (rc) D_GOTO(out, rc); + + arg->yield_cnt--; } out: From f4154c1f8071b3562ba7957d7990da05ef8a37d2 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 16 Mar 2026 06:20:22 +0000 Subject: [PATCH 3/3] cleanup codes Signed-off-by: Wang Shilong --- src/rebuild/rebuild_internal.h | 3 +-- src/rebuild/scan.c | 9 +++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 52673ab54e2..e1f8c44eda7 100644 --- 
a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -281,8 +281,7 @@ struct rebuild_pool_tls { uint64_t rebuild_pool_reclaim_obj_count; unsigned int rebuild_pool_ver; uint32_t rebuild_pool_gen; - uint64_t rebuild_pool_leader_term; - uint64_t rebuild_send_wait_start; + uint64_t rebuild_pool_leader_term; uint64_t rebuild_pool_obj_send_pending; int rebuild_pool_status; unsigned int rebuild_pool_scanning:1, diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index aad8fe78ae0..d55820ccb29 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -283,6 +283,7 @@ rebuild_objects_send_ult(void *data) daos_epoch_t *punched_ephs = NULL; unsigned int *shards = NULL; int rc = 0; + uint64_t rebuild_send_wait_start; tls = rebuild_pool_tls_lookup(rpt->rt_pool_uuid, rpt->rt_rebuild_ver, rpt->rt_rebuild_gen); @@ -312,7 +313,7 @@ rebuild_objects_send_ult(void *data) arg.rpt = rpt; arg.tls = tls; - tls->rebuild_send_wait_start = daos_gettime_coarse(); + rebuild_send_wait_start = daos_gettime_coarse(); /* * Batch OIDs before sending migrate RPCs to avoid RPC fragmentation. @@ -341,12 +342,12 @@ rebuild_objects_send_ult(void *data) if (tree_empty) { /* Reset wait clock and yield to let scan make progress. */ - tls->rebuild_send_wait_start = now; + rebuild_send_wait_start = now; dss_sleep(10); continue; } - elapsed = now - tls->rebuild_send_wait_start; + elapsed = now - rebuild_send_wait_start; if (!scan_done && tls->rebuild_pool_obj_send_pending < REBUILD_SEND_BATCH_MIN && elapsed < REBUILD_SEND_BATCH_TIMEOUT_SEC) { dss_sleep(10); @@ -367,7 +368,7 @@ rebuild_objects_send_ult(void *data) break; } - tls->rebuild_send_wait_start = now; + rebuild_send_wait_start = now; dss_sleep(0); }