Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16927 pool: Store server handles in RDBs #15846

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/include/daos/pool.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -82,7 +83,7 @@
* Version 1 corresponds to 2.2 (aggregation optimizations)
* Version 2 corresponds to 2.4 (dynamic evtree, checksum scrubbing)
* Version 3 corresponds to 2.6 (root embedded values, pool service operations tracking KVS)
* Version 4 corresponds to 2.8 (SV gang allocation)
* Version 4 corresponds to 2.8 (SV gang allocation, server pool/cont hdls)
*/
#define DAOS_POOL_GLOBAL_VERSION 4

Expand Down
3 changes: 3 additions & 0 deletions src/pool/srv_layout.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2017-2023 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -31,6 +32,8 @@ RDB_STRING_KEY(ds_pool_prop_, svc_ops_enabled);
RDB_STRING_KEY(ds_pool_prop_, svc_ops_max);
RDB_STRING_KEY(ds_pool_prop_, svc_ops_num);
RDB_STRING_KEY(ds_pool_prop_, svc_ops_age);
RDB_STRING_KEY(ds_pool_prop_, srv_handle);
RDB_STRING_KEY(ds_pool_prop_, srv_cont_handle);

/** pool handle KVS */
RDB_STRING_KEY(ds_pool_prop_, handles);
Expand Down
3 changes: 3 additions & 0 deletions src/pool/srv_layout.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -84,6 +85,8 @@ extern d_iov_t ds_pool_prop_svc_ops_enabled; /* uint32_t */
extern d_iov_t ds_pool_prop_svc_ops_max; /* uint32_t */
extern d_iov_t ds_pool_prop_svc_ops_num; /* uint32_t */
extern d_iov_t ds_pool_prop_svc_ops_age; /* uint32_t */
extern d_iov_t ds_pool_prop_srv_handle; /* uuid_t */
extern d_iov_t ds_pool_prop_srv_cont_handle; /* uuid_t */
/* Please read the IMPORTANT notes above before adding new keys. */

/*
Expand Down
163 changes: 134 additions & 29 deletions src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#define DAOS_POOL_GLOBAL_VERSION_WITH_HDL_CRED 1
#define DAOS_POOL_GLOBAL_VERSION_WITH_SVC_OPS_KVS 3
#define DAOS_POOL_GLOBAL_VERSION_WITH_DATA_THRESH 3
#define DAOS_POOL_GLOBAL_VERSION_WITH_SRV_HDLS 4

#define PS_OPS_PER_SEC 4096

Expand Down Expand Up @@ -253,7 +254,8 @@ static int
pool_space_query_bcast(crt_context_t ctx, struct pool_svc *svc, uuid_t pool_hdl,
struct daos_pool_space *ps, uint64_t *mem_file_bytes);
static int ds_pool_upgrade_if_needed(uuid_t pool_uuid, struct rsvc_hint *po_hint,
struct pool_svc *svc, crt_rpc_t *rpc);
struct pool_svc *svc, crt_rpc_t *rpc, uuid_t srv_pool_hdl,
uuid_t srv_cont_hdl);
static int
find_hdls_to_evict(struct rdb_tx *tx, struct pool_svc *svc, uuid_t **hdl_uuids,
size_t *hdl_uuids_size, int *n_hdl_uuids, char *machine);
Expand Down Expand Up @@ -774,6 +776,7 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co
uint64_t rdb_size;
int rc;
struct daos_prop_entry *entry;
uuid_t uuid;

rc = gen_pool_buf(NULL /* map */, &map_buf, map_version, ndomains, nnodes, ntargets,
domains, dss_tgt_nr);
Expand Down Expand Up @@ -902,8 +905,22 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co
}
d_iov_set(&value, &svc_ops_num, sizeof(svc_ops_num));
rc = rdb_tx_update(tx, kvs, &ds_pool_prop_svc_ops_num, &value);
if (rc != 0)
if (rc != 0) {
DL_ERROR(rc, "failed to set svc_ops_num");
goto out_map_buf;
}

d_iov_set(&value, uuid, sizeof(uuid_t));
uuid_generate(uuid);
rc = rdb_tx_update(tx, kvs, &ds_pool_prop_srv_handle, &value);
if (rc != 0) {
DL_ERROR(rc, "failed to write server pool handle");
goto out_map_buf;
}
uuid_generate(uuid);
rc = rdb_tx_update(tx, kvs, &ds_pool_prop_srv_cont_handle, &value);
if (rc != 0)
DL_ERROR(rc, "failed to write server container handle");

out_map_buf:
pool_buf_free(map_buf);
Expand Down Expand Up @@ -1953,7 +1970,8 @@ ds_pool_svc_load(struct rdb_tx *tx, uuid_t uuid, rdb_path_t *root, uint32_t *glo
*/
static int
read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf_out,
uint32_t *map_version_out, daos_prop_t **prop_out)
uint32_t *map_version_out, daos_prop_t **prop_out, uuid_t srv_pool_hdl,
uuid_t srv_cont_hdl)
{
struct rdb_tx tx;
d_iov_t value;
Expand Down Expand Up @@ -1998,31 +2016,31 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf_out,
/* Check if duplicate operations detection is enabled, for informative debug log */
rc = rdb_get_size(svc->ps_rsvc.s_db, &rdb_size);
if (rc != 0)
goto out_lock;
goto out_map_buf;
rdb_size_ok = (rdb_size >= DUP_OP_MIN_RDB_SIZE);

d_iov_set(&value, &svc->ps_ops_enabled, sizeof(svc->ps_ops_enabled));
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_svc_ops_enabled, &value);
if (rc != 0) {
D_ERROR(DF_UUID ": failed to lookup svc_ops_enabled: " DF_RC "\n",
DP_UUID(svc->ps_uuid), DP_RC(rc));
goto out_lock;
goto out_map_buf;
}

d_iov_set(&value, &svc->ps_ops_age, sizeof(svc->ps_ops_age));
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_svc_ops_age, &value);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": failed to lookup svc_ops_age",
DP_UUID(svc->ps_uuid));
goto out_lock;
goto out_map_buf;
}

d_iov_set(&value, &svc->ps_ops_max, sizeof(svc->ps_ops_max));
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_svc_ops_max, &value);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": failed to lookup svc_ops_max",
DP_UUID(svc->ps_uuid));
goto out_lock;
goto out_map_buf;
}

D_DEBUG(DB_MD,
Expand All @@ -2039,6 +2057,36 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf_out,
DP_UUID(svc->ps_uuid));
}

if (svc->ps_global_version >= DAOS_POOL_GLOBAL_VERSION_WITH_SRV_HDLS) {
d_iov_set(&value, srv_pool_hdl, sizeof(uuid_t));
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_srv_handle, &value);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": failed to look up server pool handle",
DP_UUID(svc->ps_uuid));
goto out_map_buf;
}
if (uuid_is_null(srv_pool_hdl)) {
D_ERROR(DF_UUID ": null server pool handle\n", DP_UUID(svc->ps_uuid));
rc = -DER_IO;
goto out_map_buf;
}
d_iov_set(&value, srv_cont_hdl, sizeof(uuid_t));
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_srv_cont_handle, &value);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": failed to look up server container handle",
DP_UUID(svc->ps_uuid));
goto out_map_buf;
}
if (uuid_is_null(srv_cont_hdl)) {
D_ERROR(DF_UUID ": null server container handle\n", DP_UUID(svc->ps_uuid));
rc = -DER_IO;
goto out_map_buf;
}
} else {
uuid_clear(srv_pool_hdl);
uuid_clear(srv_cont_hdl);
}

D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc));
*map_buf_out = map_buf;
*map_version_out = map_version;
Expand Down Expand Up @@ -2278,8 +2326,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
struct pool_svc *svc = pool_svc_obj(rsvc);
struct pool_buf *map_buf = NULL;
uint32_t map_version = 0;
uuid_t pool_hdl_uuid;
uuid_t cont_hdl_uuid;
uuid_t srv_pool_hdl;
uuid_t srv_cont_hdl;
daos_prop_t *prop = NULL;
bool cont_svc_up = false;
bool events_initialized = false;
Expand All @@ -2302,7 +2350,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
if (!primary_group_initialized())
return -DER_GRPVER;

rc = read_db_for_stepping_up(svc, &map_buf, &map_version, &prop);
rc = read_db_for_stepping_up(svc, &map_buf, &map_version, &prop, srv_pool_hdl,
srv_cont_hdl);
if (rc != 0)
goto out;

Expand Down Expand Up @@ -2358,27 +2407,33 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
goto out;
}

if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) {
uuid_copy(pool_hdl_uuid, svc->ps_pool->sp_srv_pool_hdl);
uuid_copy(cont_hdl_uuid, svc->ps_pool->sp_srv_cont_hdl);
if (svc->ps_global_version >= DAOS_POOL_GLOBAL_VERSION_WITH_SRV_HDLS) {
/* See the is_pool_from_srv comment in the "else" branch. */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit worried about this "delay the sp_srv_cont_hdl initialization in ds_pool_iv_refresh_hdl()" approach: what if an IV (or any other) request reaches the leader and calls ds_cont_hdl_rdb_lookup() before sp_srv_cont_hdl is initialized?

I also don't quite see why we need to compare the request handle with this global sp_srv_cont_hdl in ds_cont_hdl_rdb_lookup(). This function is only called by cont_iv_ent_fetch(), which is for IV_CONT_CAPA; is it perhaps because we mistakenly treat the global handle as a normal open handle when fetching IV_CONT_CAPA? (See cont_iv_hdl_fetch(): it is called by the common request handler and doesn't distinguish the global handle from a normal handle.)

Clearing the global handles in pool_iv_ent_invalid() doesn't look safe to me either: it looks to me like this IV invalidate function could be called when an IV request fails? Given that global handles are immutable now, I'm wondering if we still need to invalidate them at all.

Just raising some concerns here; they could be addressed in a separate ticket.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@NiuYawei, agreed. When writing this patch, this IV code was the most difficult part to investigate. I tried to 1) maintain the current algorithm for the "2.8 code hosting 2.6 pool" case, and 2) avoid making big changes to code I don't confidently understand.

One change I'm confident about is that we need new IV operation semantics: always update/invalidate an IV on the local node synchronously, while updating/invalidating it on other nodes synchronously or asynchronously based on flags. Then, this trick here would no longer be necessary. Such semantics would also benefit other cases on a PS leader.

In 3.0, we will be able to drop the current algorithm because 2.6 pools will no longer be supported. We can gradually remove the complexities during (or even before) the 3.0 development.

if (uuid_is_null(svc->ps_pool->sp_srv_pool_hdl))
uuid_copy(svc->ps_pool->sp_srv_pool_hdl, srv_pool_hdl);
} else {
uuid_generate(pool_hdl_uuid);
uuid_generate(cont_hdl_uuid);
/* Only copy server handle to make is_from_srv() check correctly, and
* container server handle will not be copied here, otherwise
* ds_pool_iv_refresh_hdl will not open the server container handle.
*/
uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid);
if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) {
uuid_copy(srv_pool_hdl, svc->ps_pool->sp_srv_pool_hdl);
uuid_copy(srv_cont_hdl, svc->ps_pool->sp_srv_cont_hdl);
} else {
uuid_generate(srv_pool_hdl);
uuid_generate(srv_cont_hdl);
/* Only copy server handle to make is_pool_from_srv() check correctly, and
* container server handle will not be copied here, otherwise
* ds_pool_iv_refresh_hdl will not open the server container handle.
*/
uuid_copy(svc->ps_pool->sp_srv_pool_hdl, srv_pool_hdl);
}
}

rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, cont_hdl_uuid);
rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, srv_pool_hdl, srv_cont_hdl);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": ds_pool_iv_srv_hdl_update failed", DP_UUID(svc->ps_uuid));
goto out;
}

/* resume pool upgrade if needed */
rc = ds_pool_upgrade_if_needed(svc->ps_uuid, NULL, svc, NULL);
rc = ds_pool_upgrade_if_needed(svc->ps_uuid, NULL, svc, NULL, srv_pool_hdl, srv_cont_hdl);
if (rc != 0)
goto out;

Expand All @@ -2395,7 +2450,7 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)

DS_POOL_NOTE_PRINT(DF_UUID": rank %u became pool service leader "DF_U64": srv_pool_hdl="
DF_UUID" srv_cont_hdl="DF_UUID"\n", DP_UUID(svc->ps_uuid), rank,
svc->ps_rsvc.s_term, DP_UUID(pool_hdl_uuid), DP_UUID(cont_hdl_uuid));
svc->ps_rsvc.s_term, DP_UUID(srv_pool_hdl), DP_UUID(srv_cont_hdl));
out:
if (rc != 0) {
if (events_initialized)
Expand Down Expand Up @@ -5583,11 +5638,13 @@ pool_upgrade_one_prop_int32(struct rdb_tx *tx, struct pool_svc *svc, uuid_t uuid
}

static int
pool_upgrade_props(struct rdb_tx *tx, struct pool_svc *svc, uuid_t pool_uuid, crt_rpc_t *rpc)
pool_upgrade_props(struct rdb_tx *tx, struct pool_svc *svc, uuid_t pool_uuid, crt_rpc_t *rpc,
uuid_t srv_pool_hdl, uuid_t srv_cont_hdl)
{
d_iov_t value;
uint64_t val;
uint32_t val32;
uuid_t valuuid;
int rc;
bool need_commit = false;
uuid_t *hdl_uuids = NULL;
Expand Down Expand Up @@ -5912,6 +5969,54 @@ pool_upgrade_props(struct rdb_tx *tx, struct pool_svc *svc, uuid_t pool_uuid, cr
need_commit = true;
}

/*
* Initialize server pool and container handles in the DB. To be conservative, we require
* the old server pool and container handles to be initialized already in memory, and use
* their existing values instead of generating new UUIDs.
*/
d_iov_set(&value, valuuid, sizeof(uuid_t));
rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_srv_handle, &value);
if (rc && rc != -DER_NONEXIST) {
D_GOTO(out_free, rc);
} else if (rc == -DER_NONEXIST) {
if (srv_pool_hdl != NULL && !uuid_is_null(srv_pool_hdl)) {
uuid_copy(valuuid, srv_pool_hdl);
} else if (!uuid_is_null(svc->ps_pool->sp_srv_pool_hdl)) {
uuid_copy(valuuid, svc->ps_pool->sp_srv_pool_hdl);
} else {
D_ERROR(DF_UUID ": server pool handle unavailable\n", DP_UUID(pool_uuid));
D_GOTO(out_free, rc);
}
rc = rdb_tx_update(tx, &svc->ps_root, &ds_pool_prop_srv_handle, &value);
if (rc) {
DL_ERROR(rc, DF_UUID ": failed to upgrade server pool handle",
DP_UUID(pool_uuid));
D_GOTO(out_free, rc);
}
need_commit = true;
}
rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_srv_cont_handle, &value);
if (rc && rc != -DER_NONEXIST) {
D_GOTO(out_free, rc);
} else if (rc == -DER_NONEXIST) {
if (srv_cont_hdl != NULL && !uuid_is_null(srv_cont_hdl)) {
uuid_copy(valuuid, srv_cont_hdl);
} else if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) {
uuid_copy(valuuid, svc->ps_pool->sp_srv_cont_hdl);
} else {
D_ERROR(DF_UUID ": server container handle unavailable\n",
DP_UUID(pool_uuid));
D_GOTO(out_free, rc);
}
rc = rdb_tx_update(tx, &svc->ps_root, &ds_pool_prop_srv_cont_handle, &value);
if (rc) {
DL_ERROR(rc, DF_UUID ": failed to upgrade server container handle",
DP_UUID(pool_uuid));
D_GOTO(out_free, rc);
}
need_commit = true;
}

D_DEBUG(DB_MD, DF_UUID ": need_commit=%s\n", DP_UUID(pool_uuid),
need_commit ? "true" : "false");
if (need_commit) {
Expand Down Expand Up @@ -6112,8 +6217,8 @@ ds_pool_mark_upgrade_completed(uuid_t pool_uuid, int ret)
}

static int
ds_pool_upgrade_if_needed(uuid_t pool_uuid, struct rsvc_hint *po_hint,
struct pool_svc *svc, crt_rpc_t *rpc)
ds_pool_upgrade_if_needed(uuid_t pool_uuid, struct rsvc_hint *po_hint, struct pool_svc *svc,
crt_rpc_t *rpc, uuid_t srv_pool_hdl, uuid_t srv_cont_hdl)
{
struct rdb_tx tx;
d_iov_t value;
Expand Down Expand Up @@ -6229,7 +6334,7 @@ ds_pool_upgrade_if_needed(uuid_t pool_uuid, struct rsvc_hint *po_hint,
/**
* Todo: make sure no rebuild/reint/expand are in progress
*/
rc = pool_upgrade_props(&tx, svc, pool_uuid, rpc);
rc = pool_upgrade_props(&tx, svc, pool_uuid, rpc, srv_pool_hdl, srv_cont_hdl);
if (rc)
D_GOTO(out_tx, rc);

Expand Down Expand Up @@ -6270,8 +6375,8 @@ ds_pool_upgrade_handler(crt_rpc_t *rpc)
struct pool_upgrade_out *out = crt_reply_get(rpc);
int rc;

rc = ds_pool_upgrade_if_needed(in->poi_op.pi_uuid,
&out->poo_op.po_hint, NULL, rpc);
rc = ds_pool_upgrade_if_needed(in->poi_op.pi_uuid, &out->poo_op.po_hint, NULL, rpc, NULL,
NULL);
out->poo_op.po_rc = rc;
D_DEBUG(DB_MD, DF_UUID ": replying rpc: %p %d\n", DP_UUID(in->poi_op.pi_uuid), rpc, rc);
crt_reply_send(rpc);
Expand Down
Loading