Skip to content

Commit

Permalink
DAOS-16936 dtx: rank range based DTX hints for coll_punch
Browse files Browse the repository at this point in the history
When handle collective punch RPC, the leader will generate hints
array for related DTX RPC (abort or commit). Each involved engine
will have one element in such array. To simplify RPC logic, such
hints array is sparse and indexed with engine's rank#. Originally,
we did not properly handled the case of incontinuous rank# in the
pool map, as to when update some hints element with large rank#,
the write maybe out of boundary and crash the others' space and
cause kinds of DRAM corruption.

Similar situation can happen during handle collective DTX resync
and cleanup.

This patch fixes such issue via building such sparse hints array
based on related engines' rank# range instead of the ranks count
in the pool map. Use relative ranks diff (real rank# - base rank#)
as the index. That will avoid out of boundary access.

Signed-off-by: Fan Yong <[email protected]>
  • Loading branch information
Nasf-Fan committed Feb 26, 2025
1 parent f9ecc96 commit 15e44d6
Show file tree
Hide file tree
Showing 8 changed files with 186 additions and 44 deletions.
95 changes: 77 additions & 18 deletions src/dtx/dtx_coll.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2023-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -93,7 +94,7 @@ dtx_coll_prep_ult(void *arg)
dci->dci_version, cont->sc_pool->spc_map_version,
opc == DTX_COLL_CHECK, false, &dcpa->dcpa_dce);
if (dcpa->dcpa_result != 0)
D_ERROR("Failed to prepare the bitmap (and hints) for collective DTX "
D_ERROR("Failed to prepare the bitmap for collective DTX "
DF_DTI" opc %u: "DF_RC"\n", DP_DTI(&dci->dci_xid), opc,
DP_RC(dcpa->dcpa_result));

Expand All @@ -106,6 +107,43 @@ dtx_coll_prep_ult(void *arg)
D_ASSERT(rc == ABT_SUCCESS);
}

static void
dtx_coll_find_rank_range(struct pl_map *map, struct pl_obj_layout *layout, uint32_t version,
uint32_t *min_rank, uint32_t *max_rank)
{
struct pool_target *target;
d_rank_t my_rank = dss_self_rank();
int rc;
int i;

for (i = 0, *min_rank = -1, *max_rank = 0; i < layout->ol_nr; i++) {
if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1)
continue;

rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target);
D_ASSERT(rc == 1);

/* Skip current leader rank. */
if (target->ta_comp.co_rank == my_rank)
continue;

/* Skip the target that (re-)joined the system after the DTX. */
if (target->ta_comp.co_ver > version)
continue;

/* Skip non-healthy one. */
if (target->ta_comp.co_status != PO_COMP_ST_UP &&
target->ta_comp.co_status != PO_COMP_ST_UPIN &&
target->ta_comp.co_status != PO_COMP_ST_DRAIN)
continue;

if (*min_rank > target->ta_comp.co_rank)
*min_rank = target->ta_comp.co_rank;
if (*max_rank < target->ta_comp.co_rank)
*max_rank = target->ta_comp.co_rank;
}
}

int
dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs,
uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint,
Expand All @@ -118,6 +156,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
struct dtx_coll_target *dct;
struct dtx_coll_entry *dce = NULL;
struct daos_obj_md md = { 0 };
uint32_t *ranks;
uint32_t rank_nr;
d_rank_t my_rank = dss_self_rank();
d_rank_t max_rank = 0;
Expand Down Expand Up @@ -198,7 +237,33 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
}
}

rank_nr = pool_map_rank_nr(map->pl_poolmap);
md.omd_id = oid.id_pub;
md.omd_ver = pm_ver;
md.omd_fdom_lvl = dct->dct_fdom_lvl;
md.omd_pda = dct->dct_pda;
md.omd_pdom_lvl = dct->dct_pdom_lvl;

rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout);
if (rc != 0) {
D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n",
DP_OID(oid.id_pub), DP_UUID(po_uuid));
goto out;
}

if (likely(mbs->dm_flags & DMF_RANK_RANGE)) {
ranks = dtx_coll_mbs_rankrange(mbs);
dce->dce_min_rank = ranks[0];
dce->dce_max_rank = ranks[1];
} else {
/*
* Only for handling the existing old collective DTX entry, relative rare case.
* So it is no matter to double scan the object layout.
*/
dtx_coll_find_rank_range(map, layout, dtx_ver,
&dce->dce_min_rank, &dce->dce_max_rank);
}

rank_nr = dce->dce_max_rank - dce->dce_min_rank + 1;
if (unlikely(rank_nr == 1))
D_GOTO(out, rc = 0);

Expand All @@ -213,26 +278,18 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
for (i = 0; i < rank_nr; i++)
dce->dce_hints[i] = (uint8_t)(-1);

md.omd_id = oid.id_pub;
md.omd_ver = pm_ver;
md.omd_fdom_lvl = dct->dct_fdom_lvl;
md.omd_pda = dct->dct_pda;
md.omd_pdom_lvl = dct->dct_pdom_lvl;

rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout);
if (rc != 0) {
D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n",
DP_OID(oid.id_pub), DP_UUID(po_uuid));
goto out;
}

for (i = 0, j = 0; i < layout->ol_nr && j < rank_nr - 1; i++) {
if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1)
continue;

rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target);
D_ASSERT(rc == 1);

/* Skip the one out of rank range. */
if (target->ta_comp.co_rank < dce->dce_min_rank ||
target->ta_comp.co_rank > dce->dce_max_rank)
continue;

/* Skip current leader rank. */
if (target->ta_comp.co_rank == my_rank)
continue;
Expand All @@ -247,8 +304,9 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
target->ta_comp.co_status != PO_COMP_ST_DRAIN)
continue;

if (dce->dce_hints[target->ta_comp.co_rank] == (uint8_t)(-1)) {
dce->dce_hints[target->ta_comp.co_rank] = target->ta_comp.co_index;
if (dce->dce_hints[target->ta_comp.co_rank - dce->dce_min_rank] == (uint8_t)(-1)) {
dce->dce_hints[target->ta_comp.co_rank - dce->dce_min_rank] =
target->ta_comp.co_index;
dce->dce_ranks->rl_ranks[j++] = target->ta_comp.co_rank;
if (max_rank < target->ta_comp.co_rank)
max_rank = target->ta_comp.co_rank;
Expand All @@ -268,7 +326,8 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
dce->dce_hint_sz = 0;
} else {
dce->dce_ranks->rl_nr = j;
dce->dce_hint_sz = max_rank + 1;
dce->dce_max_rank = max_rank;
dce->dce_hint_sz = dce->dce_max_rank - dce->dce_min_rank + 1;
}

out:
Expand Down
3 changes: 3 additions & 0 deletions src/dtx/dtx_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -81,6 +82,8 @@ CRT_RPC_DECLARE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX);
((uuid_t) (dci_co_uuid) CRT_VAR) \
((struct dtx_id) (dci_xid) CRT_VAR) \
((uint32_t) (dci_version) CRT_VAR) \
((uint32_t) (dci_min_rank) CRT_VAR) \
((uint32_t) (dci_max_rank) CRT_VAR) \
((uint32_t) (dci_padding) CRT_VAR) \
((uint64_t) (dci_epoch) CRT_VAR) \
((uint8_t) (dci_hints) CRT_ARRAY)
Expand Down
7 changes: 7 additions & 0 deletions src/dtx/dtx_rpc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1474,6 +1475,8 @@ struct dtx_coll_rpc_args {
struct dtx_id dcra_xid;
uint32_t dcra_opc;
uint32_t dcra_ver;
uint32_t dcra_min_rank;
uint32_t dcra_max_rank;
daos_epoch_t dcra_epoch;
d_rank_list_t *dcra_ranks;
uint8_t *dcra_hints;
Expand Down Expand Up @@ -1530,6 +1533,8 @@ dtx_coll_rpc(struct dtx_coll_rpc_args *dcra)
uuid_copy(dci->dci_co_uuid, dcra->dcra_cont->sc_uuid);
dci->dci_xid = dcra->dcra_xid;
dci->dci_version = dcra->dcra_ver;
dci->dci_min_rank = dcra->dcra_min_rank;
dci->dci_max_rank = dcra->dcra_max_rank;
dci->dci_epoch = dcra->dcra_epoch;
dci->dci_hints.ca_count = dcra->dcra_hint_sz;
dci->dci_hints.ca_arrays = dcra->dcra_hints;
Expand Down Expand Up @@ -1575,6 +1580,8 @@ dtx_coll_rpc_prep(struct ds_cont_child *cont, struct dtx_coll_entry *dce, uint32
dcra->dcra_xid = dce->dce_xid;
dcra->dcra_opc = opc;
dcra->dcra_ver = dce->dce_ver;
dcra->dcra_min_rank = dce->dce_min_rank;
dcra->dcra_max_rank = dce->dce_max_rank;
dcra->dcra_epoch = epoch;
dcra->dcra_ranks = dce->dce_ranks;
dcra->dcra_hints = dce->dce_hints;
Expand Down
22 changes: 18 additions & 4 deletions src/dtx/dtx_srv.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -370,10 +371,21 @@ dtx_coll_handler(crt_rpc_t *rpc)
int i;

D_ASSERT(hints != NULL);
D_ASSERT(dci->dci_hints.ca_count > myrank);

if (unlikely(dci->dci_hints.ca_count != dci->dci_max_rank - dci->dci_min_rank + 1)) {
D_ERROR("On-wire data corruption: hints_cnt %u, max_rank %u, min_rank %u\n",
(uint32_t)dci->dci_hints.ca_count, dci->dci_max_rank, dci->dci_min_rank);
D_GOTO(out, rc = -DER_INVAL);
}

if (unlikely(myrank < dci->dci_min_rank || myrank > dci->dci_max_rank)) {
D_ERROR("On-wire data corruption: myrank %u, max_rank %u, min_rank %u\n",
myrank, dci->dci_max_rank, dci->dci_min_rank);
D_GOTO(out, rc = -DER_INVAL);
}

D_DEBUG(DB_TRACE, "Handling collective DTX PRC %u on rank %d for "DF_DTI" with hint %d\n",
opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank]);
opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank - dci->dci_min_rank]);

dcpa.dcpa_rpc = rpc;
rc = ABT_future_create(1, NULL, &dcpa.dcpa_future);
Expand All @@ -382,10 +394,12 @@ dtx_coll_handler(crt_rpc_t *rpc)
D_GOTO(out, rc = dss_abterr2der(rc));
}

rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS, hints[myrank], 0, NULL);
rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS,
hints[myrank - dci->dci_min_rank], 0, NULL);
if (rc != 0) {
ABT_future_free(&dcpa.dcpa_future);
D_ERROR("Failed to create ult on XS %u: "DF_RC"\n", hints[myrank], DP_RC(rc));
D_ERROR("Failed to create ult on XS %u: "DF_RC"\n",
hints[myrank - dci->dci_min_rank], DP_RC(rc));
goto out;
}

Expand Down
26 changes: 26 additions & 0 deletions src/include/daos/dtx.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ enum dtx_mbs_flags {
DMF_SORTED_SAD_IDX = (1 << 3),
/* The dtx target information are organized as dtx_coll_target. */
DMF_COLL_TARGET = (1 << 4),
/*
* The range for the ranks [min, max] on which some object shards reside.
* It is usually used for collective DTX and appended after the bitmap in
* the MBS data.
*/
DMF_RANK_RANGE = (1 << 5),
};

/**
Expand Down Expand Up @@ -255,6 +261,26 @@ daos_dti_equal(struct dtx_id *dti0, struct dtx_id *dti1)
return memcmp(dti0, dti1, sizeof(*dti0)) == 0;
}

static inline uint32_t *
dtx_coll_mbs_rankrange(struct dtx_memberships *mbs)
{
struct dtx_daos_target *ddt;
struct dtx_coll_target *dct;
size_t size;

D_ASSERT(mbs->dm_flags & DMF_COLL_TARGET);
D_ASSERT(mbs->dm_flags & DMF_RANK_RANGE);

ddt = &mbs->dm_tgts[0];
dct = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt);

size = sizeof(*ddt) * mbs->dm_tgt_cnt + sizeof(*dct) +
sizeof(dct->dct_tgts[0]) * dct->dct_tgt_nr + dct->dct_bitmap_sz;
size = (size + 3) & ~3;

return (uint32_t *)((void *)ddt + size);
}

#define DF_DTI DF_UUID"."DF_X64
#define DF_DTIF DF_UUIDF"."DF_X64
#define DP_DTI(dti) DP_UUID((dti)->dti_uuid), (dti)->dti_hlc
Expand Down
2 changes: 2 additions & 0 deletions src/include/daos_srv/dtx_srv.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ struct dtx_sub_status {
struct dtx_coll_entry {
struct dtx_id dce_xid;
uint32_t dce_ver;
uint32_t dce_min_rank;
uint32_t dce_max_rank;
uint32_t dce_refs;
d_rank_list_t *dce_ranks;
uint8_t *dce_hints;
Expand Down
8 changes: 6 additions & 2 deletions src/object/obj_rpc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1090,7 +1091,8 @@ static int
crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op,
struct obj_dtx_mbs *odm)
{
int rc;
uint32_t size;
int rc;

rc = crt_proc_struct_dtx_id(proc, proc_op, &odm->odm_xid);
if (unlikely(rc))
Expand All @@ -1104,7 +1106,9 @@ crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op,
if (unlikely(rc))
return rc;

return crt_proc_struct_dtx_mbs(proc, proc_op, odm->odm_mbs_max_sz, &odm->odm_mbs);
/* For collective DTX, rank range will be appended after the bitmap in MBS data. */
size = ((odm->odm_mbs_max_sz + 3) & ~3) + sizeof(uint32_t) * 2;
return crt_proc_struct_dtx_mbs(proc, proc_op, size, &odm->odm_mbs);
}

static int
Expand Down
Loading

0 comments on commit 15e44d6

Please sign in to comment.