From e14d571b1b0d96fefef42759c4f81d31d33df2b4 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 21 Feb 2026 17:45:52 +0800 Subject: [PATCH] DAOS-18544 object: reserved targets for GX object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current DAOS only reserves RF × group_size targets for a GX object, whereas it should reserve targets_per_domain × RF. In addition, when the number of reserved targets is a fixed value, the likelihood of losing those targets grows significantly as the cluster scales, which can cause collocated shards and extra data movement during rebuild. This patch changes the number of reserved targets for a GX object to be no less than 30% of the targets. Signed-off-by: Liang Zhen --- src/object/obj_class.c | 59 +++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/src/object/obj_class.c b/src/object/obj_class.c index 654a316efe0..6824e931d38 100644 --- a/src/object/obj_class.c +++ b/src/object/obj_class.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -18,6 +18,9 @@ static struct daos_obj_class **oc_resil_array; static int oc_ident_array_sz; static int oc_resil_array_sz; +#define D_GX_RESERVED_ENV "D_GX_RESERVED" +static unsigned int oc_gx_reserved; + static struct daos_obj_class *oclass_ident2cl(daos_oclass_id_t oc_id, uint32_t *nr_grps); static struct daos_obj_class *oclass_resil2cl(struct daos_oclass_attr *ca); @@ -253,17 +256,48 @@ daos_oclass_grp_nr(struct daos_oclass_attr *oc_attr, struct daos_obj_md *md) /** * To honor RF setting during failure cases, let's reserve RF - * groups, so if some targets fail, there will be enough replacement + * domains, so if some targets fail, there will be enough replacement * targets to rebuild, so to avoid putting multiple shards in the same * domain, which may break the RF setting. * - * Though let's keep reserve targets to be less than 30% of the total - * targets. + * Though let's keep reserve targets to be no less than 20% on small deployment (20 domains), + * or 30% on large deployment, because otherwise layout computation will be too expensive. */ +#define DOM_LARGE 20 + static uint32_t -reserve_grp_by_rf(uint32_t target_nr, uint32_t grp_size, uint32_t rf) +reserve_grp_by_rf(uint32_t domain_nr, uint32_t target_nr, uint32_t grp_size, uint32_t rf) { - return min(((target_nr * 3) / 10) / grp_size, rf); + unsigned tgt_per_dom; + unsigned rsv_small; + unsigned rsv_large; + unsigned rsv_tgts; + + D_ASSERT(target_nr >= domain_nr && domain_nr >= 1); /* unless pool map is corrupted */ + if (oc_gx_reserved > 0) { + rsv_tgts = min(oc_gx_reserved, domain_nr - 1); + goto out; + } + + tgt_per_dom = target_nr / domain_nr; + + rsv_small = (target_nr * 2) / 10; /* 20 percent for small deployment */ + rsv_large = (target_nr * 3) / 10; /* 30 percent for large deployment */ + + if (domain_nr < DOM_LARGE) { /* small deployment */ + rsv_tgts = rsv_small; + } else { + /* ensure object layout is not smaller than the case of (dom_nr < DOM_LARGE) */ + if (DOM_LARGE * tgt_per_dom - rsv_small > target_nr - rsv_large) + rsv_tgts = rsv_small; + else + rsv_tgts = rsv_large; + } + + if (rsv_tgts < tgt_per_dom * rf) + rsv_tgts = tgt_per_dom * rf; +out: + return (rsv_tgts + grp_size - 1) / grp_size; } int @@ -303,7 +337,7 @@ daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, enum d grp_size = daos_oclass_grp_size(&ca); if (ca.ca_grp_nr == DAOS_OBJ_GRP_MAX) { - uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf_factor); + uint32_t reserve_grp = reserve_grp_by_rf(domain_nr, target_nr, grp_size, rf_factor); ca.ca_grp_nr = max(1, (target_nr / grp_size)); @@ -865,7 +899,7 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype grp_nr = max(256, target_nr * 50 / 100); break; case DAOS_OCH_SHD_EXT: - grp_nr = max(1024, target_nr * 80 / 100); + grp_nr = max(1024, target_nr * 70 / 100); break; default: D_ERROR("Invalid sharding hint\n"); @@ -874,7 +908,7 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype if (grp_nr == DAOS_OBJ_GRP_MAX || grp_nr * grp_size > target_nr) { uint32_t max_grp = target_nr / grp_size; - uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf); + uint32_t reserve_grp = reserve_grp_by_rf(domain_nr, target_nr, grp_size, rf); /* search for the highest scalability in the allowed range */ if (max_grp > reserve_grp) @@ -948,6 +982,13 @@ obj_class_init(void) if (oc_ident_array) return 0; + oc_gx_reserved = 0; + d_getenv_uint(D_GX_RESERVED_ENV, &oc_gx_reserved); + if (oc_gx_reserved > 0) { + D_INFO("%s = %u, setting it has global impact on all pools!\n", D_GX_RESERVED_ENV, + oc_gx_reserved); + } + D_ALLOC_ARRAY(oc_ident_array, OC_NR); if (!oc_ident_array) return -DER_NOMEM;