diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c
index da4e64b04d0..106205e0c5b 100644
--- a/src/dtx/dtx_coll.c
+++ b/src/dtx/dtx_coll.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2023-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -106,6 +107,43 @@ dtx_coll_prep_ult(void *arg)
         D_ASSERT(rc == ABT_SUCCESS);
 }
 
+static void
+dtx_coll_find_rank_range(struct pl_map *map, struct pl_obj_layout *layout, uint32_t version,
+                         uint32_t *min_rank, uint32_t *max_rank)
+{
+        struct pool_target *target;
+        d_rank_t            my_rank = dss_self_rank();
+        int                 rc;
+        int                 i;
+
+        for (i = 0, *min_rank = -1, *max_rank = 0; i < layout->ol_nr; i++) {
+                if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1)
+                        continue;
+
+                rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target);
+                D_ASSERT(rc == 1);
+
+                /* Skip current leader rank. */
+                if (target->ta_comp.co_rank == my_rank)
+                        continue;
+
+                /* Skip the target that (re-)joined the system after the DTX. */
+                if (target->ta_comp.co_ver > version)
+                        continue;
+
+                /* Skip non-healthy one. */
+                if (target->ta_comp.co_status != PO_COMP_ST_UP &&
+                    target->ta_comp.co_status != PO_COMP_ST_UPIN &&
+                    target->ta_comp.co_status != PO_COMP_ST_DRAIN)
+                        continue;
+
+                if (*min_rank > target->ta_comp.co_rank)
+                        *min_rank = target->ta_comp.co_rank;
+                if (*max_rank < target->ta_comp.co_rank)
+                        *max_rank = target->ta_comp.co_rank;
+        }
+}
+
 int
 dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs,
               uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint,
@@ -118,6 +156,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
         struct dtx_coll_target  *dct;
         struct dtx_coll_entry   *dce = NULL;
         struct daos_obj_md       md = { 0 };
+        uint32_t                *ranks;
         uint32_t                 rank_nr;
         d_rank_t                 my_rank = dss_self_rank();
         d_rank_t                 max_rank = 0;
@@ -198,7 +237,33 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
                 }
         }
 
-        rank_nr = pool_map_rank_nr(map->pl_poolmap);
+        md.omd_id = oid.id_pub;
+        md.omd_ver = pm_ver;
+        md.omd_fdom_lvl = dct->dct_fdom_lvl;
+        md.omd_pda = dct->dct_pda;
+        md.omd_pdom_lvl = dct->dct_pdom_lvl;
+
+        rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout);
+        if (rc != 0) {
+                D_ERROR("Failed to load object layout for " DF_OID " in pool " DF_UUID "\n",
+                        DP_OID(oid.id_pub), DP_UUID(po_uuid));
+                goto out;
+        }
+
+        if (likely(mbs->dm_flags & DMF_RANK_RANGE)) {
+                ranks = dtx_coll_mbs_rankrange(mbs);
+                dce->dce_min_rank = ranks[0];
+                dce->dce_max_rank = ranks[1];
+        } else {
+                /*
+                 * Only for handling existing old collective DTX entries, a relatively rare
+                 * case, so it does not matter that the object layout is scanned twice.
+                 */
+                dtx_coll_find_rank_range(map, layout, dtx_ver, &dce->dce_min_rank,
+                                         &dce->dce_max_rank);
+        }
+
+        rank_nr = dce->dce_max_rank - dce->dce_min_rank + 1;
         if (unlikely(rank_nr == 1))
                 D_GOTO(out, rc = 0);
 
@@ -213,19 +278,6 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
         for (i = 0; i < rank_nr; i++)
                 dce->dce_hints[i] = (uint8_t)(-1);
 
-        md.omd_id = oid.id_pub;
-        md.omd_ver = pm_ver;
-        md.omd_fdom_lvl = dct->dct_fdom_lvl;
-        md.omd_pda = dct->dct_pda;
-        md.omd_pdom_lvl = dct->dct_pdom_lvl;
-
-        rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout);
-        if (rc != 0) {
-                D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n",
-                        DP_OID(oid.id_pub), DP_UUID(po_uuid));
-                goto out;
-        }
-
         for (i = 0, j = 0; i < layout->ol_nr && j < rank_nr - 1; i++) {
                 if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1)
                         continue;
@@ -233,6 +285,11 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
                 rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target);
                 D_ASSERT(rc == 1);
 
+                /* Skip the one out of rank range. */
+                if (target->ta_comp.co_rank < dce->dce_min_rank ||
+                    target->ta_comp.co_rank > dce->dce_max_rank)
+                        continue;
+
                 /* Skip current leader rank. */
                 if (target->ta_comp.co_rank == my_rank)
                         continue;
@@ -247,8 +304,9 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
                     target->ta_comp.co_status != PO_COMP_ST_DRAIN)
                         continue;
 
-                if (dce->dce_hints[target->ta_comp.co_rank] == (uint8_t)(-1)) {
-                        dce->dce_hints[target->ta_comp.co_rank] = target->ta_comp.co_index;
+                if (dce->dce_hints[target->ta_comp.co_rank - dce->dce_min_rank] == (uint8_t)(-1)) {
+                        dce->dce_hints[target->ta_comp.co_rank - dce->dce_min_rank] =
+                                target->ta_comp.co_index;
                         dce->dce_ranks->rl_ranks[j++] = target->ta_comp.co_rank;
                         if (max_rank < target->ta_comp.co_rank)
                                 max_rank = target->ta_comp.co_rank;
@@ -268,7 +326,8 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
                 dce->dce_hint_sz = 0;
         } else {
                 dce->dce_ranks->rl_nr = j;
-                dce->dce_hint_sz = max_rank + 1;
+                dce->dce_max_rank = max_rank;
+                dce->dce_hint_sz = dce->dce_max_rank - dce->dce_min_rank + 1;
         }
 
 out:
diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h
index 3fbd06f6b92..e27a8c18135 100644
--- a/src/dtx/dtx_internal.h
+++ b/src/dtx/dtx_internal.h
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -76,14 +77,18 @@ CRT_RPC_DECLARE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX);
  * dci_hints is sparse array, one per engine, sorted against the rank ID.
  * It can hold more than 19K engines inline RPC body.
  */
+/* clang-format off */
 #define DAOS_ISEQ_COLL_DTX \
         ((uuid_t)            (dci_po_uuid)       CRT_VAR) \
         ((uuid_t)            (dci_co_uuid)       CRT_VAR) \
         ((struct dtx_id)     (dci_xid)           CRT_VAR) \
         ((uint32_t)          (dci_version)       CRT_VAR) \
+        ((uint32_t)          (dci_min_rank)      CRT_VAR) \
+        ((uint32_t)          (dci_max_rank)      CRT_VAR) \
         ((uint32_t)          (dci_padding)       CRT_VAR) \
         ((uint64_t)          (dci_epoch)         CRT_VAR) \
         ((uint8_t)           (dci_hints)         CRT_ARRAY)
+/* clang-format on */
 
 /* DTX collective RPC output fields */
 #define DAOS_OSEQ_COLL_DTX \
diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c
index 49b972d294d..ecceeb0b563 100644
--- a/src/dtx/dtx_rpc.c
+++ b/src/dtx/dtx_rpc.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -1474,6 +1475,8 @@ struct dtx_coll_rpc_args {
         struct dtx_id            dcra_xid;
         uint32_t                 dcra_opc;
         uint32_t                 dcra_ver;
+        uint32_t                 dcra_min_rank;
+        uint32_t                 dcra_max_rank;
         daos_epoch_t             dcra_epoch;
         d_rank_list_t           *dcra_ranks;
         uint8_t                 *dcra_hints;
@@ -1530,6 +1533,8 @@ dtx_coll_rpc(struct dtx_coll_rpc_args *dcra)
         uuid_copy(dci->dci_co_uuid, dcra->dcra_cont->sc_uuid);
         dci->dci_xid = dcra->dcra_xid;
         dci->dci_version = dcra->dcra_ver;
+        dci->dci_min_rank = dcra->dcra_min_rank;
+        dci->dci_max_rank = dcra->dcra_max_rank;
         dci->dci_epoch = dcra->dcra_epoch;
         dci->dci_hints.ca_count = dcra->dcra_hint_sz;
         dci->dci_hints.ca_arrays = dcra->dcra_hints;
@@ -1575,6 +1580,8 @@ dtx_coll_rpc_prep(struct ds_cont_child *cont, struct dtx_coll_entry *dce, uint32
         dcra->dcra_xid = dce->dce_xid;
         dcra->dcra_opc = opc;
         dcra->dcra_ver = dce->dce_ver;
+        dcra->dcra_min_rank = dce->dce_min_rank;
+        dcra->dcra_max_rank = dce->dce_max_rank;
         dcra->dcra_epoch = epoch;
         dcra->dcra_ranks = dce->dce_ranks;
         dcra->dcra_hints = dce->dce_hints;
diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c
index b1541ba94c8..3466839174d 100644
--- a/src/dtx/dtx_srv.c
+++ b/src/dtx/dtx_srv.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -370,10 +371,21 @@ dtx_coll_handler(crt_rpc_t *rpc)
         int                      i;
 
         D_ASSERT(hints != NULL);
-        D_ASSERT(dci->dci_hints.ca_count > myrank);
 
-        D_DEBUG(DB_TRACE, "Handling collective DTX PRC %u on rank %d for "DF_DTI" with hint %d\n",
-                opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank]);
+        if (unlikely(dci->dci_hints.ca_count != dci->dci_max_rank - dci->dci_min_rank + 1)) {
+                D_ERROR("On-wire data corruption: hints_cnt %u, max_rank %u, min_rank %u\n",
+                        (uint32_t)dci->dci_hints.ca_count, dci->dci_max_rank, dci->dci_min_rank);
+                D_GOTO(out, rc = -DER_INVAL);
+        }
+
+        if (unlikely(myrank < dci->dci_min_rank || myrank > dci->dci_max_rank)) {
+                D_ERROR("On-wire data corruption: myrank %u, max_rank %u, min_rank %u\n", myrank,
+                        dci->dci_max_rank, dci->dci_min_rank);
+                D_GOTO(out, rc = -DER_INVAL);
+        }
+
+        D_DEBUG(DB_TRACE, "Handling collective DTX RPC %u on rank %d for " DF_DTI " with hint %d\n",
+                opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank - dci->dci_min_rank]);
 
         dcpa.dcpa_rpc = rpc;
         rc = ABT_future_create(1, NULL, &dcpa.dcpa_future);
@@ -382,10 +394,12 @@ dtx_coll_handler(crt_rpc_t *rpc)
                 D_GOTO(out, rc = dss_abterr2der(rc));
         }
 
-        rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS, hints[myrank], 0, NULL);
+        rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS, hints[myrank - dci->dci_min_rank],
+                            0, NULL);
         if (rc != 0) {
                 ABT_future_free(&dcpa.dcpa_future);
-                D_ERROR("Failed to create ult on XS %u: "DF_RC"\n", hints[myrank], DP_RC(rc));
+                D_ERROR("Failed to create ult on XS %u: " DF_RC "\n",
+                        hints[myrank - dci->dci_min_rank], DP_RC(rc));
                 goto out;
         }
 
diff --git a/src/include/daos/dtx.h b/src/include/daos/dtx.h
index 8d28fc5f5f9..76973d81885 100644
--- a/src/include/daos/dtx.h
+++ b/src/include/daos/dtx.h
@@ -42,15 +42,15 @@ enum dtx_mbs_flags {
         /* The targets being modified via the DTX belong to a replicated
          * object within single redundancy group.
          */
-        DMF_SRDG_REP            = (1 << 0),
+        DMF_SRDG_REP       = (1 << 0),
         /* The MBS contains the DTX leader information, usually used for
          * distributed transaction. In old release (before 2.4), for some
          * stand-alone modification, leader information may be not stored
          * inside MBS as optimization.
          */
-        DMF_CONTAIN_LEADER      = (1 << 1),
+        DMF_CONTAIN_LEADER = (1 << 1),
         /* The dtx_memberships::dm_tgts is sorted against target ID. Obsolete. */
-        DMF_SORTED_TGT_ID       = (1 << 2),
+        DMF_SORTED_TGT_ID  = (1 << 2),
         /* The dtx_memberships::dm_tgts is sorted against shard index.
          * For most of cases, shard index matches the shard ID. But during
          * shard migration, there may be some temporary shards in related
@@ -58,9 +58,15 @@ enum dtx_mbs_flags {
          * in the object layout, but the shard index is unique. So we use
          * shard index to sort the dtx_memberships::dm_tgts. Obsolete.
          */
-        DMF_SORTED_SAD_IDX      = (1 << 3),
+        DMF_SORTED_SAD_IDX = (1 << 3),
         /* The dtx target information are organized as dtx_coll_target. */
-        DMF_COLL_TARGET         = (1 << 4),
+        DMF_COLL_TARGET    = (1 << 4),
+        /*
+         * The range for the ranks [min, max] on which some object shards reside.
+         * It is usually used for collective DTX and appended after the bitmap in
+         * the MBS data.
+         */
+        DMF_RANK_RANGE     = (1 << 5),
 };
 
 /**
@@ -255,6 +261,26 @@ daos_dti_equal(struct dtx_id *dti0, struct dtx_id *dti1)
         return memcmp(dti0, dti1, sizeof(*dti0)) == 0;
 }
 
+static inline uint32_t *
+dtx_coll_mbs_rankrange(struct dtx_memberships *mbs)
+{
+        struct dtx_daos_target  *ddt;
+        struct dtx_coll_target  *dct;
+        size_t                   size;
+
+        D_ASSERT(mbs->dm_flags & DMF_COLL_TARGET);
+        D_ASSERT(mbs->dm_flags & DMF_RANK_RANGE);
+
+        ddt = &mbs->dm_tgts[0];
+        dct = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt);
+
+        size = sizeof(*ddt) * mbs->dm_tgt_cnt + sizeof(*dct) +
+               sizeof(dct->dct_tgts[0]) * dct->dct_tgt_nr + dct->dct_bitmap_sz;
+        size = (size + 3) & ~3;
+
+        return (uint32_t *)((void *)ddt + size);
+}
+
 #define DF_DTI          DF_UUID"."DF_X64
 #define DF_DTIF         DF_UUIDF"."DF_X64
 #define DP_DTI(dti)     DP_UUID((dti)->dti_uuid), (dti)->dti_hlc
diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h
index 34c1a5d8c89..f7962d9b2c9 100644
--- a/src/include/daos_srv/dtx_srv.h
+++ b/src/include/daos_srv/dtx_srv.h
@@ -192,6 +192,8 @@ struct dtx_sub_status {
 struct dtx_coll_entry {
         struct dtx_id            dce_xid;
         uint32_t                 dce_ver;
+        uint32_t                 dce_min_rank;
+        uint32_t                 dce_max_rank;
         uint32_t                 dce_refs;
         d_rank_list_t           *dce_ranks;
         uint8_t                 *dce_hints;
diff --git a/src/object/obj_rpc.c b/src/object/obj_rpc.c
index e48c75d5e1d..7bb38e1186a 100644
--- a/src/object/obj_rpc.c
+++ b/src/object/obj_rpc.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -1090,7 +1091,8 @@ static int
 crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op,
                             struct obj_dtx_mbs *odm)
 {
-        int rc;
+        uint32_t size;
+        int      rc;
 
         rc = crt_proc_struct_dtx_id(proc, proc_op, &odm->odm_xid);
         if (unlikely(rc))
@@ -1104,7 +1106,9 @@ crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op,
         if (unlikely(rc))
                 return rc;
 
-        return crt_proc_struct_dtx_mbs(proc, proc_op, odm->odm_mbs_max_sz, &odm->odm_mbs);
+        /* For collective DTX, the rank range will be appended after the bitmap in the MBS data. */
+        size = ((odm->odm_mbs_max_sz + 3) & ~3) + sizeof(uint32_t) * 2;
+        return crt_proc_struct_dtx_mbs(proc, proc_op, size, &odm->odm_mbs);
 }
 
 static int
diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c
index 5b59f954f86..813b0282ca7 100644
--- a/src/object/srv_coll.c
+++ b/src/object/srv_coll.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -233,7 +234,9 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct
         struct dtx_daos_target  *ddt = mbs->dm_tgts;
         struct dtx_coll_entry   *dce = NULL;
         struct dtx_coll_target  *target;
-        d_rank_t                 max_rank = 0;
+        uint32_t                *ranks;
+        uint32_t                 min_rank = dcts[0].dct_rank;
+        uint32_t                 max_rank = dcts[0].dct_rank;
         uint32_t                 size;
         int                      rc = 0;
         int                      i;
@@ -254,39 +257,65 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct
                 D_GOTO(out, rc = -DER_INVAL);
         }
 
-        /* Already allocated enough space in MBS when decode to hold the targets and bitmap. */
-        target = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt);
+        /* For a non-leader, the rank range should have already been appended after the bitmap. */
+        if (!(ocpi->ocpi_flags & ORF_LEADER)) {
+                if (unlikely(!(mbs->dm_flags & DMF_RANK_RANGE))) {
+                        D_ERROR("Missing rank range information\n");
+                        D_GOTO(out, rc = -DER_INVAL);
+                }
 
-        size = sizeof(*ddt) * mbs->dm_tgt_cnt + sizeof(*target) +
-               sizeof(dcts[0].dct_tgt_ids[0]) * dcts[0].dct_tgt_nr + dcts[0].dct_bitmap_sz;
-        if (unlikely(ocpi->ocpi_odm.odm_mbs_max_sz < sizeof(*mbs) + size)) {
-                D_ERROR("Pre-allocated MBS buffer is too small: %u vs %ld + %u\n",
-                        ocpi->ocpi_odm.odm_mbs_max_sz, sizeof(*mbs), size);
-                D_GOTO(out, rc = -DER_INVAL);
+                ranks = dtx_coll_mbs_rankrange(mbs);
+                min_rank = ranks[0];
+                max_rank = ranks[1];
+        } else if (dct_nr > 1) {
+                min_rank = dcts[1].dct_rank;
+                max_rank = dcts[1].dct_rank;
+
+                for (i = 2; i < dct_nr; i++) {
+                        if (min_rank > dcts[i].dct_rank)
+                                min_rank = dcts[i].dct_rank;
+                        if (max_rank < dcts[i].dct_rank)
+                                max_rank = dcts[i].dct_rank;
+                }
         }
 
+        /*
+         * Enough space was already allocated in the MBS during decode to hold the targets,
+         * bitmap, and rank range information. Please check crt_proc_struct_dtx_mbs() for detail.
+         *
+         * For different DTX participants, the dct_tgt_nr and bitmap size may be different.
+         * So each target needs to build its own MBS data: dct + bitmap + rank range.
+         */
+
+        target = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt);
         target->dct_tgt_nr = dcts[0].dct_tgt_nr;
         memcpy(target->dct_tgts, dcts[0].dct_tgt_ids,
                sizeof(dcts[0].dct_tgt_ids[0]) * dcts[0].dct_tgt_nr);
         target->dct_bitmap_sz = dcts[0].dct_bitmap_sz;
         memcpy(target->dct_tgts + target->dct_tgt_nr, dcts[0].dct_bitmap, dcts[0].dct_bitmap_sz);
-        mbs->dm_data_size = size;
 
         D_ALLOC_PTR(dce);
         if (dce == NULL)
                 D_GOTO(out, rc = -DER_NOMEM);
 
-        dce->dce_xid = ocpi->ocpi_xid;
-        dce->dce_ver = ocpi->ocpi_map_ver;
-        dce->dce_refs = 1;
-
         D_ALLOC(dce->dce_bitmap, dcts[0].dct_bitmap_sz);
         if (dce->dce_bitmap == NULL)
                 D_GOTO(out, rc = -DER_NOMEM);
+
+        dce->dce_xid = ocpi->ocpi_xid;
+        dce->dce_ver = ocpi->ocpi_map_ver;
+        dce->dce_min_rank = min_rank;
+        dce->dce_max_rank = max_rank;
+        dce->dce_refs = 1;
         dce->dce_bitmap_sz = dcts[0].dct_bitmap_sz;
         memcpy(dce->dce_bitmap, dcts[0].dct_bitmap, dcts[0].dct_bitmap_sz);
 
+        mbs->dm_flags |= DMF_RANK_RANGE;
+        ranks = dtx_coll_mbs_rankrange(mbs);
+        ranks[0] = dce->dce_min_rank;
+        ranks[1] = dce->dce_max_rank;
+        mbs->dm_data_size = (void *)(ranks + 2) - (void *)ddt;
+
         if (!(ocpi->ocpi_flags & ORF_LEADER) || unlikely(dct_nr <= 1))
                 D_GOTO(out, rc = 0);
 
@@ -297,8 +326,8 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct
                 D_GOTO(out, rc = -DER_INVAL);
         }
 
-        size = pool_map_rank_nr(map->pl_poolmap);
-        D_ALLOC_ARRAY(dce->dce_hints, size);
+        /* The dce_hints may be a sparse array. */
+        D_ALLOC_ARRAY(dce->dce_hints, dce->dce_max_rank - dce->dce_min_rank + 1);
         if (dce->dce_hints == NULL)
                 D_GOTO(out, rc = -DER_NOMEM);
 
@@ -309,8 +338,6 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct
         /* Set i = 1 to skip leader_rank. */
         for (i = 1; i < dct_nr; i++) {
                 dce->dce_ranks->rl_ranks[i - 1] = dcts[i].dct_rank;
-                if (max_rank < dcts[i].dct_rank)
-                        max_rank = dcts[i].dct_rank;
 
                 size = dcts[i].dct_bitmap_sz << 3;
                 if (size > dss_tgt_nr)
@@ -318,13 +345,13 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct
 
                 for (j = 0; j < size; j++) {
                         if (isset(dcts[i].dct_bitmap, j)) {
-                                dce->dce_hints[dcts[i].dct_rank] = j;
+                                dce->dce_hints[dcts[i].dct_rank - dce->dce_min_rank] = j;
                                 break;
                         }
                 }
         }
 
-        dce->dce_hint_sz = max_rank + 1;
+        dce->dce_hint_sz = dce->dce_max_rank - dce->dce_min_rank + 1;
 
 out:
         if (map != NULL)
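Note on the indexing scheme above: the core change is that dce_hints/dci_hints move from a sparse array indexed by absolute rank (max_rank + 1 slots) to a dense window indexed by rank - min_rank (max_rank - min_rank + 1 slots). The following standalone C sketch is illustrative only; it is not part of the patch, and the rank values and XS indexes in it are invented. It mirrors the single-pass range scan of dtx_coll_find_rank_range() and the windowed indexing used for dce_hints:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
        /* Invented participant ranks of one collective DTX. */
        uint32_t ranks[] = { 4100, 4096, 4103, 4099 };
        uint32_t nr = sizeof(ranks) / sizeof(ranks[0]);
        uint32_t min_rank = ranks[0];
        uint32_t max_rank = ranks[0];
        uint8_t *hints;
        uint32_t i;

        /* One pass to find the rank range, as dtx_coll_find_rank_range() does. */
        for (i = 1; i < nr; i++) {
                if (min_rank > ranks[i])
                        min_rank = ranks[i];
                if (max_rank < ranks[i])
                        max_rank = ranks[i];
        }

        /* Old scheme: max_rank + 1 slots (4104 here). New scheme: only the
         * window max_rank - min_rank + 1 (8 here), indexed by rank - min_rank. */
        hints = malloc(max_rank - min_rank + 1);
        if (hints == NULL)
                return 1;
        memset(hints, 0xff, max_rank - min_rank + 1); /* (uint8_t)(-1) means "no hint" */

        for (i = 0; i < nr; i++)
                hints[ranks[i] - min_rank] = (uint8_t)i; /* invented VOS XS index */

        printf("range [%u, %u], hint_sz %u instead of %u\n",
               min_rank, max_rank, max_rank - min_rank + 1, max_rank + 1);

        free(hints);
        return 0;
}

The same window bounds dci_hints.ca_count on the wire, which is what the new consistency checks in dtx_coll_handler() verify before dereferencing hints[myrank - dci->dci_min_rank].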