Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16936 dtx: rank range based DTX hints for coll_punch #15979

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 76 additions & 17 deletions src/dtx/dtx_coll.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2023-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -106,6 +107,43 @@ dtx_coll_prep_ult(void *arg)
D_ASSERT(rc == ABT_SUCCESS);
}

static void
dtx_coll_find_rank_range(struct pl_map *map, struct pl_obj_layout *layout, uint32_t version,
uint32_t *min_rank, uint32_t *max_rank)
{
struct pool_target *target;
d_rank_t my_rank = dss_self_rank();
int rc;
int i;

for (i = 0, *min_rank = -1, *max_rank = 0; i < layout->ol_nr; i++) {
if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1)
continue;

rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target);
D_ASSERT(rc == 1);

/* Skip current leader rank. */
if (target->ta_comp.co_rank == my_rank)
continue;

/* Skip the target that (re-)joined the system after the DTX. */
if (target->ta_comp.co_ver > version)
continue;

/* Skip non-healthy one. */
if (target->ta_comp.co_status != PO_COMP_ST_UP &&
target->ta_comp.co_status != PO_COMP_ST_UPIN &&
target->ta_comp.co_status != PO_COMP_ST_DRAIN)
continue;

if (*min_rank > target->ta_comp.co_rank)
*min_rank = target->ta_comp.co_rank;
if (*max_rank < target->ta_comp.co_rank)
*max_rank = target->ta_comp.co_rank;
}
}

int
dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs,
uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint,
Expand All @@ -118,6 +156,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
struct dtx_coll_target *dct;
struct dtx_coll_entry *dce = NULL;
struct daos_obj_md md = { 0 };
uint32_t *ranks;
uint32_t rank_nr;
d_rank_t my_rank = dss_self_rank();
d_rank_t max_rank = 0;
Expand Down Expand Up @@ -198,7 +237,33 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
}
}

rank_nr = pool_map_rank_nr(map->pl_poolmap);
md.omd_id = oid.id_pub;
md.omd_ver = pm_ver;
md.omd_fdom_lvl = dct->dct_fdom_lvl;
md.omd_pda = dct->dct_pda;
md.omd_pdom_lvl = dct->dct_pdom_lvl;

rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout);
if (rc != 0) {
D_ERROR("Failed to load object layout for " DF_OID " in pool " DF_UUID "\n",
DP_OID(oid.id_pub), DP_UUID(po_uuid));
goto out;
}

if (likely(mbs->dm_flags & DMF_RANK_RANGE)) {
ranks = dtx_coll_mbs_rankrange(mbs);
dce->dce_min_rank = ranks[0];
dce->dce_max_rank = ranks[1];
} else {
/*
* Only for handling the existing old collective DTX entry, relative rare case.
* So it is no matter to double scan the object layout.
*/
dtx_coll_find_rank_range(map, layout, dtx_ver, &dce->dce_min_rank,
&dce->dce_max_rank);
}

rank_nr = dce->dce_max_rank - dce->dce_min_rank + 1;
if (unlikely(rank_nr == 1))
D_GOTO(out, rc = 0);

Expand All @@ -213,26 +278,18 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
for (i = 0; i < rank_nr; i++)
dce->dce_hints[i] = (uint8_t)(-1);

md.omd_id = oid.id_pub;
md.omd_ver = pm_ver;
md.omd_fdom_lvl = dct->dct_fdom_lvl;
md.omd_pda = dct->dct_pda;
md.omd_pdom_lvl = dct->dct_pdom_lvl;

rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout);
if (rc != 0) {
D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n",
DP_OID(oid.id_pub), DP_UUID(po_uuid));
goto out;
}

for (i = 0, j = 0; i < layout->ol_nr && j < rank_nr - 1; i++) {
if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1)
continue;

rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target);
D_ASSERT(rc == 1);

/* Skip the one out of rank range. */
if (target->ta_comp.co_rank < dce->dce_min_rank ||
target->ta_comp.co_rank > dce->dce_max_rank)
continue;

/* Skip current leader rank. */
if (target->ta_comp.co_rank == my_rank)
continue;
Expand All @@ -247,8 +304,9 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
target->ta_comp.co_status != PO_COMP_ST_DRAIN)
continue;

if (dce->dce_hints[target->ta_comp.co_rank] == (uint8_t)(-1)) {
dce->dce_hints[target->ta_comp.co_rank] = target->ta_comp.co_index;
if (dce->dce_hints[target->ta_comp.co_rank - dce->dce_min_rank] == (uint8_t)(-1)) {
dce->dce_hints[target->ta_comp.co_rank - dce->dce_min_rank] =
target->ta_comp.co_index;
dce->dce_ranks->rl_ranks[j++] = target->ta_comp.co_rank;
if (max_rank < target->ta_comp.co_rank)
max_rank = target->ta_comp.co_rank;
Expand All @@ -268,7 +326,8 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
dce->dce_hint_sz = 0;
} else {
dce->dce_ranks->rl_nr = j;
dce->dce_hint_sz = max_rank + 1;
dce->dce_max_rank = max_rank;
dce->dce_hint_sz = dce->dce_max_rank - dce->dce_min_rank + 1;
}

out:
Expand Down
5 changes: 5 additions & 0 deletions src/dtx/dtx_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -76,14 +77,18 @@ CRT_RPC_DECLARE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX);
* dci_hints is sparse array, one per engine, sorted against the rank ID.
* It can hold more than 19K engines inline RPC body.
*/
/* clang-format off */
#define DAOS_ISEQ_COLL_DTX \
((uuid_t) (dci_po_uuid) CRT_VAR) \
((uuid_t) (dci_co_uuid) CRT_VAR) \
((struct dtx_id) (dci_xid) CRT_VAR) \
((uint32_t) (dci_version) CRT_VAR) \
((uint32_t) (dci_min_rank) CRT_VAR) \
((uint32_t) (dci_max_rank) CRT_VAR) \
((uint32_t) (dci_padding) CRT_VAR) \
((uint64_t) (dci_epoch) CRT_VAR) \
((uint8_t) (dci_hints) CRT_ARRAY)
/* clang-format on */

/* DTX collective RPC output fields */
#define DAOS_OSEQ_COLL_DTX \
Expand Down
7 changes: 7 additions & 0 deletions src/dtx/dtx_rpc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1474,6 +1475,8 @@ struct dtx_coll_rpc_args {
struct dtx_id dcra_xid;
uint32_t dcra_opc;
uint32_t dcra_ver;
uint32_t dcra_min_rank;
uint32_t dcra_max_rank;
daos_epoch_t dcra_epoch;
d_rank_list_t *dcra_ranks;
uint8_t *dcra_hints;
Expand Down Expand Up @@ -1530,6 +1533,8 @@ dtx_coll_rpc(struct dtx_coll_rpc_args *dcra)
uuid_copy(dci->dci_co_uuid, dcra->dcra_cont->sc_uuid);
dci->dci_xid = dcra->dcra_xid;
dci->dci_version = dcra->dcra_ver;
dci->dci_min_rank = dcra->dcra_min_rank;
dci->dci_max_rank = dcra->dcra_max_rank;
dci->dci_epoch = dcra->dcra_epoch;
dci->dci_hints.ca_count = dcra->dcra_hint_sz;
dci->dci_hints.ca_arrays = dcra->dcra_hints;
Expand Down Expand Up @@ -1575,6 +1580,8 @@ dtx_coll_rpc_prep(struct ds_cont_child *cont, struct dtx_coll_entry *dce, uint32
dcra->dcra_xid = dce->dce_xid;
dcra->dcra_opc = opc;
dcra->dcra_ver = dce->dce_ver;
dcra->dcra_min_rank = dce->dce_min_rank;
dcra->dcra_max_rank = dce->dce_max_rank;
dcra->dcra_epoch = epoch;
dcra->dcra_ranks = dce->dce_ranks;
dcra->dcra_hints = dce->dce_hints;
Expand Down
24 changes: 19 additions & 5 deletions src/dtx/dtx_srv.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -370,10 +371,21 @@ dtx_coll_handler(crt_rpc_t *rpc)
int i;

D_ASSERT(hints != NULL);
D_ASSERT(dci->dci_hints.ca_count > myrank);

D_DEBUG(DB_TRACE, "Handling collective DTX PRC %u on rank %d for "DF_DTI" with hint %d\n",
opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank]);
if (unlikely(dci->dci_hints.ca_count != dci->dci_max_rank - dci->dci_min_rank + 1)) {
D_ERROR("On-wire data corruption: hints_cnt %u, max_rank %u, min_rank %u\n",
(uint32_t)dci->dci_hints.ca_count, dci->dci_max_rank, dci->dci_min_rank);
D_GOTO(out, rc = -DER_INVAL);
}

if (unlikely(myrank < dci->dci_min_rank || myrank > dci->dci_max_rank)) {
D_ERROR("On-wire data corruption: myrank %u, max_rank %u, min_rank %u\n", myrank,
dci->dci_max_rank, dci->dci_min_rank);
D_GOTO(out, rc = -DER_INVAL);
}

D_DEBUG(DB_TRACE, "Handling collective DTX PRC %u on rank %d for " DF_DTI " with hint %d\n",
opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank - dci->dci_min_rank]);

dcpa.dcpa_rpc = rpc;
rc = ABT_future_create(1, NULL, &dcpa.dcpa_future);
Expand All @@ -382,10 +394,12 @@ dtx_coll_handler(crt_rpc_t *rpc)
D_GOTO(out, rc = dss_abterr2der(rc));
}

rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS, hints[myrank], 0, NULL);
rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS, hints[myrank - dci->dci_min_rank],
0, NULL);
if (rc != 0) {
ABT_future_free(&dcpa.dcpa_future);
D_ERROR("Failed to create ult on XS %u: "DF_RC"\n", hints[myrank], DP_RC(rc));
D_ERROR("Failed to create ult on XS %u: " DF_RC "\n",
hints[myrank - dci->dci_min_rank], DP_RC(rc));
goto out;
}

Expand Down
36 changes: 31 additions & 5 deletions src/include/daos/dtx.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,25 +42,31 @@ enum dtx_mbs_flags {
/* The targets being modified via the DTX belong to a replicated
* object within single redundancy group.
*/
DMF_SRDG_REP = (1 << 0),
DMF_SRDG_REP = (1 << 0),
/* The MBS contains the DTX leader information, usually used for
* distributed transaction. In old release (before 2.4), for some
* stand-alone modification, leader information may be not stored
* inside MBS as optimization.
*/
DMF_CONTAIN_LEADER = (1 << 1),
DMF_CONTAIN_LEADER = (1 << 1),
/* The dtx_memberships::dm_tgts is sorted against target ID. Obsolete. */
DMF_SORTED_TGT_ID = (1 << 2),
DMF_SORTED_TGT_ID = (1 << 2),
/* The dtx_memberships::dm_tgts is sorted against shard index.
* For most of cases, shard index matches the shard ID. But during
* shard migration, there may be some temporary shards in related
* object layout. Under such case, related shard ID is not unique
* in the object layout, but the shard index is unique. So we use
* shard index to sort the dtx_memberships::dm_tgts. Obsolete.
*/
DMF_SORTED_SAD_IDX = (1 << 3),
DMF_SORTED_SAD_IDX = (1 << 3),
/* The dtx target information are organized as dtx_coll_target. */
DMF_COLL_TARGET = (1 << 4),
DMF_COLL_TARGET = (1 << 4),
/*
* The range for the ranks [min, max] on which some object shards reside.
* It is usually used for collective DTX and appended after the bitmap in
* the MBS data.
*/
DMF_RANK_RANGE = (1 << 5),
};

/**
Expand Down Expand Up @@ -255,6 +261,26 @@ daos_dti_equal(struct dtx_id *dti0, struct dtx_id *dti1)
return memcmp(dti0, dti1, sizeof(*dti0)) == 0;
}

static inline uint32_t *
dtx_coll_mbs_rankrange(struct dtx_memberships *mbs)
{
struct dtx_daos_target *ddt;
struct dtx_coll_target *dct;
size_t size;

D_ASSERT(mbs->dm_flags & DMF_COLL_TARGET);
D_ASSERT(mbs->dm_flags & DMF_RANK_RANGE);

ddt = &mbs->dm_tgts[0];
dct = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt);

size = sizeof(*ddt) * mbs->dm_tgt_cnt + sizeof(*dct) +
sizeof(dct->dct_tgts[0]) * dct->dct_tgt_nr + dct->dct_bitmap_sz;
size = (size + 3) & ~3;

return (uint32_t *)((void *)ddt + size);
}

#define DF_DTI DF_UUID"."DF_X64
#define DF_DTIF DF_UUIDF"."DF_X64
#define DP_DTI(dti) DP_UUID((dti)->dti_uuid), (dti)->dti_hlc
Expand Down
2 changes: 2 additions & 0 deletions src/include/daos_srv/dtx_srv.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ struct dtx_sub_status {
struct dtx_coll_entry {
struct dtx_id dce_xid;
uint32_t dce_ver;
uint32_t dce_min_rank;
uint32_t dce_max_rank;
uint32_t dce_refs;
d_rank_list_t *dce_ranks;
uint8_t *dce_hints;
Expand Down
8 changes: 6 additions & 2 deletions src/object/obj_rpc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1090,7 +1091,8 @@ static int
crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op,
struct obj_dtx_mbs *odm)
{
int rc;
uint32_t size;
int rc;

rc = crt_proc_struct_dtx_id(proc, proc_op, &odm->odm_xid);
if (unlikely(rc))
Expand All @@ -1104,7 +1106,9 @@ crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op,
if (unlikely(rc))
return rc;

return crt_proc_struct_dtx_mbs(proc, proc_op, odm->odm_mbs_max_sz, &odm->odm_mbs);
/* For collective DTX, rank range will be appended after the bitmap in MBS data. */
size = ((odm->odm_mbs_max_sz + 3) & ~3) + sizeof(uint32_t) * 2;
return crt_proc_struct_dtx_mbs(proc, proc_op, size, &odm->odm_mbs);
}

static int
Expand Down
Loading