From 0f05b2fd80b43ea45bd2b0fed556b7aa43f01e08 Mon Sep 17 00:00:00 2001
From: Li Wei
Date: Sat, 25 Jan 2025 02:02:55 +0900
Subject: [PATCH] DAOS-16930 pool: Share map bulk resources (#15763)

Improve the efficiency of concurrent POOL_QUERY, POOL_CONNECT, and
POOL_TGT_QUERY_MAP handling by giving these handlers a chance to share
the same pool map buffer and pool map buffer bulk handle.

Introduce pool space query caching on the service leader to avoid space
query flooding. The pool space cache expiration time is 2 seconds by
default and can be changed via the DAOS_POOL_SPACE_CACHE_INTVL
environment variable; setting the expiration time to zero disables the
space cache.

Signed-off-by: Li Wei
Signed-off-by: Niu Yawei
Co-authored-by: Xuezhao Liu
Co-authored-by: Liang Zhen
Co-authored-by: Dalton Bohning
---
 src/include/daos_srv/pool.h                 |  10 +-
 src/pool/srv.c                              |  10 ++
 src/pool/srv_internal.h                     |  11 +-
 src/pool/srv_pool.c                         |  76 +++++++++---
 src/pool/srv_target.c                       | 131 ++++++++++++++++++--
 src/pool/srv_util.c                         |  25 +---
 src/tests/ftest/util/server_utils_params.py |   5 +
 7 files changed, 212 insertions(+), 56 deletions(-)

diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h
index 70f5a4083ea..3f7de9f7f1d 100644
--- a/src/include/daos_srv/pool.h
+++ b/src/include/daos_srv/pool.h
@@ -38,6 +38,13 @@ struct ds_pool_svc;
 /* age of an entry in svc_ops KVS before it may be evicted */
 #define DEFAULT_SVC_OPS_ENTRY_AGE_SEC_MAX 300ULL
 
+/* Pool map buffer cache */
+struct ds_pool_map_bc {
+	struct pool_buf *pmc_buf;
+	crt_bulk_t       pmc_bulk;
+	uint32_t         pmc_ref;
+};
+
 /*
  * Pool object
  *
@@ -48,7 +55,8 @@ struct ds_pool {
 	uuid_t                 sp_uuid; /* pool UUID */
 	d_list_t               sp_hdls;
 	ABT_rwlock             sp_lock;
-	struct pool_map       *sp_map;
+	struct pool_map       *sp_map;
+	struct ds_pool_map_bc *sp_map_bc;
 	uint32_t               sp_map_version; /* temporary */
 	uint32_t               sp_ec_cell_sz;
 	uint64_t               sp_reclaim;
diff --git a/src/pool/srv.c b/src/pool/srv.c
index 9c9611c04ef..df93ecf3a5f 100644
--- a/src/pool/srv.c
+++ b/src/pool/srv.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -22,6 +23,7 @@
 bool     ec_agg_disabled;
 uint32_t pw_rf = -1; /* pool wise redundancy factor */
+uint32_t ps_cache_intvl = 2; /* pool space cache expiration time, in seconds */
 #define PW_RF_DEFAULT (2)
 #define PW_RF_MIN     (0)
 #define PW_RF_MAX     (4)
@@ -76,6 +78,14 @@ init(void)
 		pw_rf = PW_RF_DEFAULT;
 	D_INFO("pool redundancy factor %d\n", pw_rf);
 
+	d_getenv_uint32_t("DAOS_POOL_SPACE_CACHE_INTVL", &ps_cache_intvl);
+	if (ps_cache_intvl > 20) {
+		D_WARN("pool space cache expiration time %u is too large, use default value\n",
+		       ps_cache_intvl);
+		ps_cache_intvl = 2;
+	}
+	D_INFO("pool space cache expiration time set to %u seconds\n", ps_cache_intvl);
+
 	ds_pool_rsvc_class_register();
 
 	bio_register_ract_ops(&nvme_reaction_ops);
diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h
index 612f7760fd1..87eb0cf9c31 100644
--- a/src/pool/srv_internal.h
+++ b/src/pool/srv_internal.h
@@ -1,5 +1,6 @@
 /*
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -17,6 +18,7 @@
 #include
 
 extern uint32_t pw_rf;
+extern uint32_t ps_cache_intvl;
 
 /**
  * Global pool metrics
@@ -236,8 +238,10 @@ int ds_pool_tgt_prop_update(struct ds_pool *pool, struct pool_iv_prop *iv_prop)
 int  ds_pool_tgt_connect(struct ds_pool *pool, struct pool_iv_conn *pic);
 void ds_pool_tgt_query_map_handler(crt_rpc_t *rpc);
 void ds_pool_tgt_discard_handler(crt_rpc_t *rpc);
-void
-     ds_pool_tgt_warmup_handler(crt_rpc_t *rpc);
+void ds_pool_tgt_warmup_handler(crt_rpc_t *rpc);
+int  ds_pool_lookup_map_bc(struct ds_pool *pool, crt_context_t ctx,
+			   struct ds_pool_map_bc **map_bc_out, uint32_t *map_version_out);
+void ds_pool_put_map_bc(struct ds_pool_map_bc *map_bc);
 
 /*
  * srv_util.c
@@ -246,8 +250,7 @@ bool ds_pool_map_rank_up(struct pool_map *map, d_rank_t rank);
 int ds_pool_plan_svc_reconfs(int svc_rf, struct pool_map *map, d_rank_list_t *replicas,
			     d_rank_t self, bool filter_only, d_rank_list_t **to_add_out,
			     d_rank_list_t **to_remove_out);
-int ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version,
-			     crt_rpc_t *rpc, crt_bulk_t remote_bulk,
+int ds_pool_transfer_map_buf(struct ds_pool_map_bc *map_bc, crt_rpc_t *rpc, crt_bulk_t remote_bulk,
 			     uint32_t *required_buf_size);
 
 extern struct bio_reaction_ops nvme_reaction_ops;
diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c
index 066d7bbd88a..a28534c40c0 100644
--- a/src/pool/srv_pool.c
+++ b/src/pool/srv_pool.c
@@ -191,6 +191,12 @@ sched_cancel_and_wait(struct pool_svc_sched *sched)
 	sched_wait(sched);
 }
 
+struct pool_space_cache {
+	struct daos_pool_space psc_space;
+	uint64_t               psc_timestamp;
+	ABT_mutex              psc_lock;
+};
+
 /* Pool service */
 struct pool_svc {
 	struct ds_rsvc          ps_rsvc;
@@ -204,6 +210,7 @@ struct pool_svc {
 	rdb_path_t              ps_ops;    /* metadata ops KVS */
 	int                     ps_error;  /* in DB data (see pool_svc_lookup_leader) */
 	struct pool_svc_events  ps_events;
+	struct pool_space_cache ps_space_cache;
 	uint32_t                ps_global_version;
 	int                     ps_svc_rf;
 	bool                    ps_force_notify; /* MS of PS membership */
@@ -1235,9 +1242,16 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc)
 		goto err_pool;
 	}
 
+	rc = ABT_mutex_create(&svc->ps_space_cache.psc_lock);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("failed to create psc_lock: %d\n", rc);
+		rc = dss_abterr2der(rc);
+		goto err_lock;
+	}
+
 	rc = rdb_path_init(&svc->ps_root);
 	if (rc != 0)
-		goto err_lock;
+		goto err_psc_lock;
 	rc = rdb_path_push(&svc->ps_root, &rdb_path_root_key);
 	if (rc != 0)
 		goto err_root;
@@ -1306,6 +1320,8 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc)
 	rdb_path_fini(&svc->ps_handles);
 err_root:
 	rdb_path_fini(&svc->ps_root);
+err_psc_lock:
+	ABT_mutex_free(&svc->ps_space_cache.psc_lock);
 err_lock:
 	ABT_rwlock_free(&svc->ps_lock);
 err_pool:
@@ -3872,8 +3888,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 	struct pool_connect_in  *in = crt_req_get(rpc);
 	struct pool_connect_out *out = crt_reply_get(rpc);
 	struct pool_svc         *svc;
-	struct pool_buf         *map_buf = NULL;
-	uint32_t                 map_version;
 	uint32_t                 connectable;
 	uint32_t                 global_ver;
 	uint32_t                 obj_layout_ver;
@@ -4095,12 +4109,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 		goto out_map_version;
 	}
 
-	rc = read_map_buf(&tx, &svc->ps_root, &map_buf, &map_version);
-	if (rc != 0) {
-		D_ERROR(DF_UUID": failed to read pool map: "DF_RC"\n",
-			DP_UUID(svc->ps_uuid), DP_RC(rc));
-		D_GOTO(out_map_version, rc);
-	}
 	transfer_map = true;
 	if (skip_update)
 		D_GOTO(out_map_version, rc = 0);
@@ -4208,13 +4216,20 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 	ABT_rwlock_unlock(svc->ps_lock);
 	rdb_tx_end(&tx);
 	if (rc == 0 && transfer_map) {
-		rc = ds_pool_transfer_map_buf(map_buf, map_version, rpc, bulk,
-					      &out->pco_map_buf_size);
+		struct ds_pool_map_bc *map_bc;
+		uint32_t               map_version;
+
+		rc = ds_pool_lookup_map_bc(svc->ps_pool, rpc->cr_ctx, &map_bc, &map_version);
+		if (rc == 0) {
+			rc = ds_pool_transfer_map_buf(map_bc, rpc, bulk, &out->pco_map_buf_size);
+			ds_pool_put_map_bc(map_bc);
+			/* Ensure the map version matches the map buffer. */
+			out->pco_op.po_map_version = map_version;
+		}
 		/** TODO: roll back tx if transfer fails? Perhaps rdb_tx_discard()? */
 	}
 	if (rc == 0)
 		rc = op_val.ov_rc;
-	D_FREE(map_buf);
 	D_FREE(hdl);
 	D_FREE(machine);
 	if (prop)
@@ -4487,8 +4502,23 @@ pool_space_query_bcast(crt_context_t ctx, struct pool_svc *svc, uuid_t pool_hdl,
 	struct pool_tgt_query_in  *in;
 	struct pool_tgt_query_out *out;
 	crt_rpc_t                 *rpc;
+	struct pool_space_cache   *cache = &svc->ps_space_cache;
+	uint64_t                   cur_time = 0;
+	bool                       unlock = false;
 	int                        rc;
 
+	if (ps_cache_intvl > 0) {
+		ABT_mutex_lock(cache->psc_lock);
+
+		cur_time = daos_gettime_coarse();
+		if (cur_time < cache->psc_timestamp + ps_cache_intvl) {
+			*ps = cache->psc_space;
+			ABT_mutex_unlock(cache->psc_lock);
+			return 0;
+		}
+		unlock = true;
+	}
+
 	D_DEBUG(DB_MD, DF_UUID": bcasting\n", DP_UUID(svc->ps_uuid));
 
 	rc = bcast_create(ctx, svc, POOL_TGT_QUERY, NULL, &rpc);
@@ -4516,11 +4546,18 @@
 	} else {
 		D_ASSERT(ps != NULL);
 		*ps = out->tqo_space;
+		if (ps_cache_intvl > 0 && cur_time > cache->psc_timestamp) {
+			cache->psc_timestamp = cur_time;
+			cache->psc_space = *ps;
+		}
 	}
 
 out_rpc:
 	crt_req_decref(rpc);
 out:
+	if (unlock)
+		ABT_mutex_unlock(cache->psc_lock);
+
 	D_DEBUG(DB_MD, DF_UUID": bcasted: "DF_RC"\n", DP_UUID(svc->ps_uuid),
 		DP_RC(rc));
 	return rc;
@@ -4979,7 +5016,7 @@ ds_pool_query_handler(crt_rpc_t *rpc, int handler_version)
 	struct pool_query_in  *in = crt_req_get(rpc);
 	struct pool_query_out *out = crt_reply_get(rpc);
 	daos_prop_t           *prop = NULL;
-	struct pool_buf       *map_buf;
+	struct ds_pool_map_bc *map_bc;
 	uint32_t               map_version = 0;
 	struct pool_svc       *svc;
 	struct pool_metrics   *metrics;
@@ -5144,19 +5181,18 @@
 		}
 	}
 
-	rc = read_map_buf(&tx, &svc->ps_root, &map_buf, &map_version);
-	if (rc != 0)
-		D_ERROR(DF_UUID": failed to read pool map: "DF_RC"\n",
-			DP_UUID(svc->ps_uuid), DP_RC(rc));
-
 out_lock:
 	ABT_rwlock_unlock(svc->ps_lock);
 	rdb_tx_end(&tx);
 	if (rc != 0)
 		goto out_svc;
-	rc = ds_pool_transfer_map_buf(map_buf, map_version, rpc, bulk, &out->pqo_map_buf_size);
-	D_FREE(map_buf);
+
+	rc = ds_pool_lookup_map_bc(svc->ps_pool, rpc->cr_ctx, &map_bc, &map_version);
+	if (rc != 0)
+		goto out_svc;
+	rc = ds_pool_transfer_map_buf(map_bc, rpc, bulk, &out->pqo_map_buf_size);
+	ds_pool_put_map_bc(map_bc);
 	if (rc != 0)
 		goto out_svc;
diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c
index 95b70eb8b66..cc0539938be 100644
--- a/src/pool/srv_target.c
+++ b/src/pool/srv_target.c
@@ -930,6 +930,8 @@ pool_free_ref(struct daos_llink *llink)
 	/** release metrics */
 	ds_pool_metrics_stop(pool);
 
+	if (pool->sp_map_bc != NULL)
+		ds_pool_put_map_bc(pool->sp_map_bc);
 	ABT_cond_free(&pool->sp_fetch_hdls_cond);
 	ABT_cond_free(&pool->sp_fetch_hdls_done_cond);
 	ABT_mutex_free(&pool->sp_mutex);
@@ -1830,6 +1832,110 @@ update_child_map(void *data)
 	return 0;
 }
 
+static int
+map_bc_create(crt_context_t ctx, struct pool_map *map, struct ds_pool_map_bc **map_bc_out)
+{
+	struct ds_pool_map_bc *map_bc;
+	d_iov_t                map_iov;
+	d_sg_list_t            map_sgl;
+	int                    rc;
+
+	D_ALLOC_PTR(map_bc);
+	if (map_bc == NULL) {
+		rc = -DER_NOMEM;
+		goto err;
+	}
+
+	map_bc->pmc_ref = 1;
+
+	rc = pool_buf_extract(map, &map_bc->pmc_buf);
+	if (rc != 0) {
+		DL_ERROR(rc, "failed to extract pool map buffer");
+		goto err_map_bc;
+	}
+
+	d_iov_set(&map_iov, map_bc->pmc_buf, pool_buf_size(map_bc->pmc_buf->pb_nr));
+	map_sgl.sg_nr = 1;
+	map_sgl.sg_nr_out = 0;
+	map_sgl.sg_iovs = &map_iov;
+
+	rc = crt_bulk_create(ctx, &map_sgl, CRT_BULK_RO, &map_bc->pmc_bulk);
+	if (rc != 0)
+		goto err_buf;
+
+	*map_bc_out = map_bc;
+	return 0;
+
+err_buf:
+	D_FREE(map_bc->pmc_buf);
+err_map_bc:
+	D_FREE(map_bc);
+err:
+	return rc;
+}
+
+static void
+map_bc_get(struct ds_pool_map_bc *map_bc)
+{
+	map_bc->pmc_ref++;
+}
+
+static void
+map_bc_put(struct ds_pool_map_bc *map_bc)
+{
+	map_bc->pmc_ref--;
+	if (map_bc->pmc_ref == 0) {
+		crt_bulk_free(map_bc->pmc_bulk);
+		D_FREE(map_bc->pmc_buf);
+		D_FREE(map_bc);
+	}
+}
+
+int
+ds_pool_lookup_map_bc(struct ds_pool *pool, crt_context_t ctx, struct ds_pool_map_bc **map_bc_out,
+		      uint32_t *map_version_out)
+{
+	struct ds_pool_map_bc *map_bc;
+	uint32_t               map_version;
+
+	D_ASSERT(dss_get_module_info()->dmi_xs_id == 0);
+
+	/* For accessing pool->sp_map, but not really necessary. */
+	ABT_rwlock_rdlock(pool->sp_lock);
+
+	if (pool->sp_map == NULL) {
+		ABT_rwlock_unlock(pool->sp_lock);
+		return -DER_NONEXIST;
+	}
+
+	if (pool->sp_map_bc == NULL) {
+		int rc;
+
+		rc = map_bc_create(ctx, pool->sp_map, &pool->sp_map_bc);
+		if (rc != 0) {
+			ABT_rwlock_unlock(pool->sp_lock);
+			return rc;
+		}
+	}
+
+	map_bc_get(pool->sp_map_bc);
+	map_bc = pool->sp_map_bc;
+	map_version = pool_map_get_version(pool->sp_map);
+
+	ABT_rwlock_unlock(pool->sp_lock);
+
+	*map_bc_out = map_bc;
+	*map_version_out = map_version;
+	return 0;
+}
+
+void
+ds_pool_put_map_bc(struct ds_pool_map_bc *map_bc)
+{
+	D_ASSERT(dss_get_module_info()->dmi_xs_id == 0);
+	map_bc_put(map_bc);
+}
+
 int
 ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf,
 		       unsigned int map_version)
@@ -1886,6 +1992,12 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf,
 		pool->sp_map = map;
 		map = tmp;
 
+		/* Invalidate pool->sp_map_bc. */
+		if (pool->sp_map_bc != NULL) {
+			map_bc_put(pool->sp_map_bc);
+			pool->sp_map_bc = NULL;
+		}
+
 		map_updated = true;
 		D_INFO(DF_UUID ": updated pool map: version=%u->%u pointer=%p->%p\n",
 		       DP_UUID(pool->sp_uuid), map == NULL ? 0 : pool_map_get_version(map),
@@ -2105,7 +2217,7 @@ ds_pool_tgt_query_map_handler(crt_rpc_t *rpc)
 	struct pool_tgt_query_map_in  *in = crt_req_get(rpc);
 	struct pool_tgt_query_map_out *out = crt_reply_get(rpc);
 	struct ds_pool                *pool;
-	struct pool_buf               *buf;
+	struct ds_pool_map_bc         *bc;
 	unsigned int                   version;
 	int                            rc;
@@ -2155,22 +2267,19 @@ ds_pool_tgt_query_map_handler(crt_rpc_t *rpc)
 	}
 
 	/* Inefficient; better invent some zero-copy IV APIs. */
-	ABT_rwlock_rdlock(pool->sp_lock);
-	version = (pool->sp_map == NULL ? 0 : pool_map_get_version(pool->sp_map));
+	rc = ds_pool_lookup_map_bc(pool, rpc->cr_ctx, &bc, &version);
+	if (rc == -DER_NONEXIST)
+		version = 0;
+	else if (rc != 0)
+		goto out_pool;
 	if (version <= in->tmi_map_version) {
 		rc = 0;
-		ABT_rwlock_unlock(pool->sp_lock);
 		goto out_version;
 	}
 
-	rc = pool_buf_extract(pool->sp_map, &buf);
-	ABT_rwlock_unlock(pool->sp_lock);
-	if (rc != 0)
-		goto out_version;
-
-	rc = ds_pool_transfer_map_buf(buf, version, rpc, in->tmi_map_bulk,
-				      &out->tmo_map_buf_size);
+	rc = ds_pool_transfer_map_buf(bc, rpc, in->tmi_map_bulk, &out->tmo_map_buf_size);
 
-	D_FREE(buf);
+	ds_pool_put_map_bc(bc);
 
 out_version:
 	out->tmo_op.po_map_version = version;
 out_pool:
diff --git a/src/pool/srv_util.c b/src/pool/srv_util.c
index 29f012d5844..a69b3d559b3 100644
--- a/src/pool/srv_util.c
+++ b/src/pool/srv_util.c
@@ -149,22 +149,18 @@ bulk_cb(const struct crt_bulk_cb_info *cb_info)
  * pool map buffer size.
  */
 int
-ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version,
-			 crt_rpc_t *rpc, crt_bulk_t remote_bulk,
+ds_pool_transfer_map_buf(struct ds_pool_map_bc *map_bc, crt_rpc_t *rpc, crt_bulk_t remote_bulk,
 			 uint32_t *required_buf_size)
 {
 	size_t               map_buf_size;
 	daos_size_t          remote_bulk_size;
-	d_iov_t              map_iov;
-	d_sg_list_t          map_sgl;
-	crt_bulk_t           bulk;
 	struct crt_bulk_desc map_desc;
 	crt_bulk_opid_t      map_opid;
 	ABT_eventual         eventual;
 	int                 *status;
 	int                  rc;
 
-	map_buf_size = pool_buf_size(map_buf->pb_nr);
+	map_buf_size = pool_buf_size(map_bc->pmc_buf->pb_nr);
 
 	/* Check if the client bulk buffer is large enough. */
 	rc = crt_bulk_get_len(remote_bulk, &remote_bulk_size);
@@ -176,28 +172,19 @@ ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version,
 		goto out;
 	}
 
-	d_iov_set(&map_iov, map_buf, map_buf_size);
-	map_sgl.sg_nr = 1;
-	map_sgl.sg_nr_out = 0;
-	map_sgl.sg_iovs = &map_iov;
-
-	rc = crt_bulk_create(rpc->cr_ctx, &map_sgl, CRT_BULK_RO, &bulk);
-	if (rc != 0)
-		goto out;
-
 	/* Prepare "map_desc" for crt_bulk_transfer(). */
 	map_desc.bd_rpc = rpc;
 	map_desc.bd_bulk_op = CRT_BULK_PUT;
 	map_desc.bd_remote_hdl = remote_bulk;
 	map_desc.bd_remote_off = 0;
-	map_desc.bd_local_hdl = bulk;
+	map_desc.bd_local_hdl = map_bc->pmc_bulk;
 	map_desc.bd_local_off = 0;
-	map_desc.bd_len = map_iov.iov_len;
+	map_desc.bd_len = map_buf_size;
 
 	rc = ABT_eventual_create(sizeof(*status), &eventual);
 	if (rc != ABT_SUCCESS) {
 		rc = dss_abterr2der(rc);
-		goto out_bulk;
+		goto out;
 	}
 
 	rc = crt_bulk_transfer(&map_desc, bulk_cb, &eventual, &map_opid);
@@ -214,8 +201,6 @@
 
 out_eventual:
 	ABT_eventual_free(&eventual);
-out_bulk:
-	crt_bulk_free(bulk);
 out:
 	return rc;
 }
diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py
index 46db4891220..032a8183659 100644
--- a/src/tests/ftest/util/server_utils_params.py
+++ b/src/tests/ftest/util/server_utils_params.py
@@ -1,5 +1,6 @@
 """
   (C) Copyright 2020-2024 Intel Corporation.
+  (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -436,6 +437,10 @@ class EngineYamlParameters(YamlParameters):
                 "D_LOG_FILE_APPEND_PID=1",
                 "DAOS_POOL_RF=4",
                 "CRT_EVENT_DELAY=1",
+                # pylint: disable-next=fixme
+                # FIXME disable the space cache since some tests need to verify instant pool space
+                # changes; move this global setting to individual test settings in a follow-on PR.
+                "DAOS_POOL_SPACE_CACHE_INTVL=0",
                 "COVFILE=/tmp/test.cov"],
             "ofi+tcp": [],
             "ofi+tcp;ofi_rxm": [],
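
Note on the sharing scheme: the ds_pool_map_bc lifecycle above is easiest to see in
isolation. Below is a minimal standalone sketch (not DAOS code) of the same pattern:
a snapshot buffer is created lazily under the owner's lock, handed out with a
reference count, and on a map update the owner merely drops its own reference, so
requests still transferring the old snapshot keep it alive while new requests get a
fresh one. POSIX threads stand in for Argobots, a plain string stands in for
pool_buf plus crt_bulk_t, and every name here (map_cache, map_cache_acquire, ...) is
hypothetical.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stands in for ds_pool_map_bc: a buffer plus (conceptually) its bulk handle. */
struct map_cache {
	char     *buf;
	uint32_t  version;
	int       ref;	/* protected by owner_lock */
};

static pthread_mutex_t   owner_lock = PTHREAD_MUTEX_INITIALIZER;
static struct map_cache *owner_bc;		/* like pool->sp_map_bc */
static uint32_t          owner_version = 1;	/* like pool_map_get_version() */

/* Lazily create the shared snapshot, then hand out one reference. */
static struct map_cache *
map_cache_acquire(uint32_t *version_out)
{
	struct map_cache *bc;

	pthread_mutex_lock(&owner_lock);
	if (owner_bc == NULL) {
		owner_bc = calloc(1, sizeof(*owner_bc));
		if (owner_bc == NULL)
			goto err_unlock;
		owner_bc->buf = strdup("serialized pool map");
		if (owner_bc->buf == NULL) {
			free(owner_bc);
			owner_bc = NULL;
			goto err_unlock;
		}
		owner_bc->version = owner_version;
		owner_bc->ref = 1;	/* the owner's own reference */
	}
	bc = owner_bc;
	bc->ref++;
	*version_out = bc->version;
	pthread_mutex_unlock(&owner_lock);
	return bc;

err_unlock:
	pthread_mutex_unlock(&owner_lock);
	return NULL;
}

static void
map_cache_release(struct map_cache *bc)
{
	pthread_mutex_lock(&owner_lock);
	if (--bc->ref == 0) {	/* last user of an already detached snapshot */
		free(bc->buf);
		free(bc);
	}
	pthread_mutex_unlock(&owner_lock);
}

/* On a map update, detach the snapshot instead of freeing it: readers that
 * still hold references keep it alive until they release them. */
static void
map_cache_invalidate(uint32_t new_version)
{
	pthread_mutex_lock(&owner_lock);
	owner_version = new_version;
	if (owner_bc != NULL) {
		struct map_cache *old = owner_bc;

		owner_bc = NULL;
		if (--old->ref == 0) {
			free(old->buf);
			free(old);
		}
	}
	pthread_mutex_unlock(&owner_lock);
}

int
main(void)
{
	uint32_t          version;
	struct map_cache *bc = map_cache_acquire(&version);	/* reader */

	if (bc == NULL)
		return 1;
	map_cache_invalidate(2);	/* concurrent map update */
	printf("reader still sees version %u: %s\n", version, bc->buf);
	map_cache_release(bc);		/* last reference frees old snapshot */
	return 0;
}

The patch follows the same ownership split: pool_free_ref() and
ds_pool_tgt_map_update() drop the pool's own reference via map_bc_put(), while the
RPC handlers pair ds_pool_lookup_map_bc() with ds_pool_put_map_bc().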
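Likewise, a minimal standalone sketch (not DAOS code) of the leader-side space
cache in pool_space_query_bcast(): a result younger than the expiration interval is
served from cache, and the lock is deliberately held across the expensive refresh so
concurrent callers wait for one query instead of each issuing their own, which is
the anti-flooding property the commit message describes. Again pthreads stand in for
Argobots and all names (space_cache, query_space, ...) are hypothetical; in the
patch the interval is ps_cache_intvl, read from DAOS_POOL_SPACE_CACHE_INTVL, 2
seconds by default, with 0 disabling the cache.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static uint32_t cache_intvl = 2;	/* seconds; 0 disables caching */

struct space_cache {
	pthread_mutex_t lock;
	uint64_t        timestamp;	/* when the cached value was produced */
	uint64_t        space;		/* cached result */
};

/* Stands in for the POOL_TGT_QUERY broadcast to every engine. */
static uint64_t
expensive_space_query(void)
{
	sleep(1);
	return 42;
}

static uint64_t
query_space(struct space_cache *c)
{
	uint64_t now = 0;
	uint64_t val;
	int      unlock = 0;

	if (cache_intvl > 0) {
		pthread_mutex_lock(&c->lock);
		now = (uint64_t)time(NULL);
		if (now < c->timestamp + cache_intvl) {
			val = c->space;		/* fresh enough: cache hit */
			pthread_mutex_unlock(&c->lock);
			return val;
		}
		unlock = 1;	/* hold the lock across the refresh */
	}

	val = expensive_space_query();

	/* Skip stale updates, mirroring the patch's cur_time > psc_timestamp check. */
	if (cache_intvl > 0 && now > c->timestamp) {
		c->timestamp = now;
		c->space = val;
	}

	if (unlock)
		pthread_mutex_unlock(&c->lock);
	return val;
}

int
main(void)
{
	struct space_cache c = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("%llu\n", (unsigned long long)query_space(&c));	/* miss: ~1s */
	printf("%llu\n", (unsigned long long)query_space(&c));	/* hit: instant */
	return 0;
}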