diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 70f5a4083ea..3f7de9f7f1d 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -38,6 +38,13 @@ struct ds_pool_svc; /* age of an entry in svc_ops KVS before it may be evicted */ #define DEFAULT_SVC_OPS_ENTRY_AGE_SEC_MAX 300ULL +/* Pool map buffer cache */ +struct ds_pool_map_bc { + struct pool_buf *pmc_buf; + crt_bulk_t pmc_bulk; + uint32_t pmc_ref; +}; + /* * Pool object * @@ -48,7 +55,8 @@ struct ds_pool { uuid_t sp_uuid; /* pool UUID */ d_list_t sp_hdls; ABT_rwlock sp_lock; - struct pool_map *sp_map; + struct pool_map *sp_map; + struct ds_pool_map_bc *sp_map_bc; uint32_t sp_map_version; /* temporary */ uint32_t sp_ec_cell_sz; uint64_t sp_reclaim; diff --git a/src/pool/srv.c b/src/pool/srv.c index 9c9611c04ef..df93ecf3a5f 100644 --- a/src/pool/srv.c +++ b/src/pool/srv.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -22,6 +23,7 @@ bool ec_agg_disabled; uint32_t pw_rf = -1; /* pool wise redundancy factor */ +uint32_t ps_cache_intvl = 2; /* pool space cache expiration time, in seconds */ #define PW_RF_DEFAULT (2) #define PW_RF_MIN (0) #define PW_RF_MAX (4) @@ -76,6 +78,14 @@ init(void) pw_rf = PW_RF_DEFAULT; D_INFO("pool redundancy factor %d\n", pw_rf); + d_getenv_uint32_t("DAOS_POOL_SPACE_CACHE_INTVL", &ps_cache_intvl); + if (ps_cache_intvl > 20) { + D_WARN("pool space cache expiration time %u is too large, use default value\n", + ps_cache_intvl); + ps_cache_intvl = 2; + } + D_INFO("pool space cache expiration time set to %u seconds\n", ps_cache_intvl); + ds_pool_rsvc_class_register(); bio_register_ract_ops(&nvme_reaction_ops); diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index 612f7760fd1..87eb0cf9c31 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2024 
Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -17,6 +18,7 @@ #include extern uint32_t pw_rf; +extern uint32_t ps_cache_intvl; /** * Global pool metrics @@ -236,8 +238,10 @@ int ds_pool_tgt_prop_update(struct ds_pool *pool, struct pool_iv_prop *iv_prop); int ds_pool_tgt_connect(struct ds_pool *pool, struct pool_iv_conn *pic); void ds_pool_tgt_query_map_handler(crt_rpc_t *rpc); void ds_pool_tgt_discard_handler(crt_rpc_t *rpc); -void - ds_pool_tgt_warmup_handler(crt_rpc_t *rpc); +void ds_pool_tgt_warmup_handler(crt_rpc_t *rpc); +int ds_pool_lookup_map_bc(struct ds_pool *pool, crt_context_t ctx, + struct ds_pool_map_bc **map_bc_out, uint32_t *map_version_out); +void ds_pool_put_map_bc(struct ds_pool_map_bc *map_bc); /* * srv_util.c @@ -246,8 +250,7 @@ bool ds_pool_map_rank_up(struct pool_map *map, d_rank_t rank); int ds_pool_plan_svc_reconfs(int svc_rf, struct pool_map *map, d_rank_list_t *replicas, d_rank_t self, bool filter_only, d_rank_list_t **to_add_out, d_rank_list_t **to_remove_out); -int ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version, - crt_rpc_t *rpc, crt_bulk_t remote_bulk, +int ds_pool_transfer_map_buf(struct ds_pool_map_bc *map_bc, crt_rpc_t *rpc, crt_bulk_t remote_bulk, uint32_t *required_buf_size); extern struct bio_reaction_ops nvme_reaction_ops; diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 066d7bbd88a..a28534c40c0 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -191,6 +191,12 @@ sched_cancel_and_wait(struct pool_svc_sched *sched) sched_wait(sched); } +struct pool_space_cache { + struct daos_pool_space psc_space; + uint64_t psc_timestamp; + ABT_mutex psc_lock; +}; + /* Pool service */ struct pool_svc { struct ds_rsvc ps_rsvc; @@ -204,6 +210,7 @@ struct pool_svc { rdb_path_t ps_ops; /* metadata ops KVS */ int ps_error; /* in DB data (see pool_svc_lookup_leader) */ struct pool_svc_events ps_events; + struct 
pool_space_cache ps_space_cache; uint32_t ps_global_version; int ps_svc_rf; bool ps_force_notify; /* MS of PS membership */ @@ -1235,9 +1242,16 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) goto err_pool; } + rc = ABT_mutex_create(&svc->ps_space_cache.psc_lock); + if (rc != ABT_SUCCESS) { + D_ERROR("failed to create psc_lock: %d\n", rc); + rc = dss_abterr2der(rc); + goto err_lock; + } + rc = rdb_path_init(&svc->ps_root); if (rc != 0) - goto err_lock; + goto err_psc_lock; rc = rdb_path_push(&svc->ps_root, &rdb_path_root_key); if (rc != 0) goto err_root; @@ -1306,6 +1320,8 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) rdb_path_fini(&svc->ps_handles); err_root: rdb_path_fini(&svc->ps_root); +err_psc_lock: + ABT_mutex_free(&svc->ps_space_cache.psc_lock); err_lock: ABT_rwlock_free(&svc->ps_lock); err_pool: @@ -3872,8 +3888,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) struct pool_connect_in *in = crt_req_get(rpc); struct pool_connect_out *out = crt_reply_get(rpc); struct pool_svc *svc; - struct pool_buf *map_buf = NULL; - uint32_t map_version; uint32_t connectable; uint32_t global_ver; uint32_t obj_layout_ver; @@ -4095,12 +4109,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) goto out_map_version; } - rc = read_map_buf(&tx, &svc->ps_root, &map_buf, &map_version); - if (rc != 0) { - D_ERROR(DF_UUID": failed to read pool map: "DF_RC"\n", - DP_UUID(svc->ps_uuid), DP_RC(rc)); - D_GOTO(out_map_version, rc); - } transfer_map = true; if (skip_update) D_GOTO(out_map_version, rc = 0); @@ -4208,13 +4216,20 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) ABT_rwlock_unlock(svc->ps_lock); rdb_tx_end(&tx); if (rc == 0 && transfer_map) { - rc = ds_pool_transfer_map_buf(map_buf, map_version, rpc, bulk, - &out->pco_map_buf_size); + struct ds_pool_map_bc *map_bc; + uint32_t map_version; + + rc = ds_pool_lookup_map_bc(svc->ps_pool, rpc->cr_ctx, &map_bc, &map_version); + if (rc == 0) { + rc = 
ds_pool_transfer_map_buf(map_bc, rpc, bulk, &out->pco_map_buf_size); + ds_pool_put_map_bc(map_bc); + /* Ensure the map version matches the map buffer. */ + out->pco_op.po_map_version = map_version; + } /** TODO: roll back tx if transfer fails? Perhaps rdb_tx_discard()? */ } if (rc == 0) rc = op_val.ov_rc; - D_FREE(map_buf); D_FREE(hdl); D_FREE(machine); if (prop) @@ -4487,8 +4502,23 @@ pool_space_query_bcast(crt_context_t ctx, struct pool_svc *svc, uuid_t pool_hdl, struct pool_tgt_query_in *in; struct pool_tgt_query_out *out; crt_rpc_t *rpc; + struct pool_space_cache *cache = &svc->ps_space_cache; + uint64_t cur_time = 0; + bool unlock = false; int rc; + if (ps_cache_intvl > 0) { + ABT_mutex_lock(cache->psc_lock); + + cur_time = daos_gettime_coarse(); + if (cur_time < cache->psc_timestamp + ps_cache_intvl) { + *ps = cache->psc_space; + ABT_mutex_unlock(cache->psc_lock); + return 0; + } + unlock = true; + } + D_DEBUG(DB_MD, DF_UUID": bcasting\n", DP_UUID(svc->ps_uuid)); rc = bcast_create(ctx, svc, POOL_TGT_QUERY, NULL, &rpc); @@ -4516,11 +4546,18 @@ pool_space_query_bcast(crt_context_t ctx, struct pool_svc *svc, uuid_t pool_hdl, } else { D_ASSERT(ps != NULL); *ps = out->tqo_space; + if (ps_cache_intvl > 0 && cur_time > cache->psc_timestamp) { + cache->psc_timestamp = cur_time; + cache->psc_space = *ps; + } } out_rpc: crt_req_decref(rpc); out: + if (unlock) + ABT_mutex_unlock(cache->psc_lock); + D_DEBUG(DB_MD, DF_UUID": bcasted: "DF_RC"\n", DP_UUID(svc->ps_uuid), DP_RC(rc)); return rc; @@ -4979,7 +5016,7 @@ ds_pool_query_handler(crt_rpc_t *rpc, int handler_version) struct pool_query_in *in = crt_req_get(rpc); struct pool_query_out *out = crt_reply_get(rpc); daos_prop_t *prop = NULL; - struct pool_buf *map_buf; + struct ds_pool_map_bc *map_bc; uint32_t map_version = 0; struct pool_svc *svc; struct pool_metrics *metrics; @@ -5144,19 +5181,18 @@ ds_pool_query_handler(crt_rpc_t *rpc, int handler_version) } } - rc = read_map_buf(&tx, &svc->ps_root, &map_buf, 
&map_version); - if (rc != 0) - D_ERROR(DF_UUID": failed to read pool map: "DF_RC"\n", - DP_UUID(svc->ps_uuid), DP_RC(rc)); - out_lock: ABT_rwlock_unlock(svc->ps_lock); rdb_tx_end(&tx); if (rc != 0) goto out_svc; - rc = ds_pool_transfer_map_buf(map_buf, map_version, rpc, bulk, &out->pqo_map_buf_size); - D_FREE(map_buf); + + rc = ds_pool_lookup_map_bc(svc->ps_pool, rpc->cr_ctx, &map_bc, &map_version); + if (rc != 0) + goto out_svc; + rc = ds_pool_transfer_map_buf(map_bc, rpc, bulk, &out->pqo_map_buf_size); + ds_pool_put_map_bc(map_bc); if (rc != 0) goto out_svc; diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 95b70eb8b66..cc0539938be 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -930,6 +930,8 @@ pool_free_ref(struct daos_llink *llink) /** release metrics */ ds_pool_metrics_stop(pool); + if (pool->sp_map_bc != NULL) + ds_pool_put_map_bc(pool->sp_map_bc); ABT_cond_free(&pool->sp_fetch_hdls_cond); ABT_cond_free(&pool->sp_fetch_hdls_done_cond); ABT_mutex_free(&pool->sp_mutex); @@ -1830,6 +1832,110 @@ update_child_map(void *data) return 0; } +static int +map_bc_create(crt_context_t ctx, struct pool_map *map, struct ds_pool_map_bc **map_bc_out) +{ + struct ds_pool_map_bc *map_bc; + d_iov_t map_iov; + d_sg_list_t map_sgl; + int rc; + + D_ALLOC_PTR(map_bc); + if (map_bc == NULL) { + rc = -DER_NOMEM; + goto err; + } + + map_bc->pmc_ref = 1; + + rc = pool_buf_extract(map, &map_bc->pmc_buf); + if (rc != 0) { + DL_ERROR(rc, "failed to extract pool map buffer"); + goto err_map_bc; + } + + d_iov_set(&map_iov, map_bc->pmc_buf, pool_buf_size(map_bc->pmc_buf->pb_nr)); + map_sgl.sg_nr = 1; + map_sgl.sg_nr_out = 0; + map_sgl.sg_iovs = &map_iov; + + rc = crt_bulk_create(ctx, &map_sgl, CRT_BULK_RO, &map_bc->pmc_bulk); + if (rc != 0) + goto err_buf; + + *map_bc_out = map_bc; + return 0; + +err_buf: + D_FREE(map_bc->pmc_buf); +err_map_bc: + D_FREE(map_bc); +err: + return rc; +} + +static void +map_bc_get(struct ds_pool_map_bc *map_bc) +{ + 
map_bc->pmc_ref++; +} + +static void +map_bc_put(struct ds_pool_map_bc *map_bc) +{ + map_bc->pmc_ref--; + if (map_bc->pmc_ref == 0) { + crt_bulk_free(map_bc->pmc_bulk); + D_FREE(map_bc->pmc_buf); + D_FREE(map_bc); + } +} + +int +ds_pool_lookup_map_bc(struct ds_pool *pool, crt_context_t ctx, struct ds_pool_map_bc **map_bc_out, + uint32_t *map_version_out) +{ + struct ds_pool_map_bc *map_bc; + uint32_t map_version; + + D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + + /* For accessing pool->sp_map, but not really necessary. */ + ABT_rwlock_rdlock(pool->sp_lock); + + if (pool->sp_map == NULL) { + ABT_rwlock_unlock(pool->sp_lock); + return -DER_NONEXIST; + } + + if (pool->sp_map_bc == NULL) { + int rc; + + rc = map_bc_create(ctx, pool->sp_map, &pool->sp_map_bc); + if (rc != 0) { + ABT_rwlock_unlock(pool->sp_lock); + return rc; + } + } + + map_bc_get(pool->sp_map_bc); + map_bc = pool->sp_map_bc; + map_version = pool_map_get_version(pool->sp_map); + + ABT_rwlock_unlock(pool->sp_lock); + + *map_bc_out = map_bc; + *map_version_out = map_version; + return 0; +} + +void +ds_pool_put_map_bc(struct ds_pool_map_bc *map_bc) +{ + D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + map_bc_put(map_bc); +} + int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, unsigned int map_version) @@ -1886,6 +1992,12 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, pool->sp_map = map; map = tmp; + /* Invalidate pool->sp_map_bc. */ + if (pool->sp_map_bc != NULL) { + map_bc_put(pool->sp_map_bc); + pool->sp_map_bc = NULL; + } + map_updated = true; D_INFO(DF_UUID ": updated pool map: version=%u->%u pointer=%p->%p\n", DP_UUID(pool->sp_uuid), map == NULL ? 
0 : pool_map_get_version(map), @@ -2105,7 +2217,7 @@ ds_pool_tgt_query_map_handler(crt_rpc_t *rpc) struct pool_tgt_query_map_in *in = crt_req_get(rpc); struct pool_tgt_query_map_out *out = crt_reply_get(rpc); struct ds_pool *pool; - struct pool_buf *buf; + struct ds_pool_map_bc *bc; unsigned int version; int rc; @@ -2155,22 +2267,19 @@ ds_pool_tgt_query_map_handler(crt_rpc_t *rpc) } /* Inefficient; better invent some zero-copy IV APIs. */ - ABT_rwlock_rdlock(pool->sp_lock); - version = (pool->sp_map == NULL ? 0 : pool_map_get_version(pool->sp_map)); + rc = ds_pool_lookup_map_bc(pool, rpc->cr_ctx, &bc, &version); + if (rc == -DER_NONEXIST) + version = 0; + else if (rc != 0) + goto out_pool; if (version <= in->tmi_map_version) { rc = 0; - ABT_rwlock_unlock(pool->sp_lock); goto out_version; } - rc = pool_buf_extract(pool->sp_map, &buf); - ABT_rwlock_unlock(pool->sp_lock); - if (rc != 0) - goto out_version; - rc = ds_pool_transfer_map_buf(buf, version, rpc, in->tmi_map_bulk, - &out->tmo_map_buf_size); + rc = ds_pool_transfer_map_buf(bc, rpc, in->tmi_map_bulk, &out->tmo_map_buf_size); - D_FREE(buf); + ds_pool_put_map_bc(bc); out_version: out->tmo_op.po_map_version = version; out_pool: diff --git a/src/pool/srv_util.c b/src/pool/srv_util.c index 29f012d5844..a69b3d559b3 100644 --- a/src/pool/srv_util.c +++ b/src/pool/srv_util.c @@ -149,22 +149,18 @@ bulk_cb(const struct crt_bulk_cb_info *cb_info) * pool map buffer size. 
*/ int -ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version, - crt_rpc_t *rpc, crt_bulk_t remote_bulk, +ds_pool_transfer_map_buf(struct ds_pool_map_bc *map_bc, crt_rpc_t *rpc, crt_bulk_t remote_bulk, uint32_t *required_buf_size) { size_t map_buf_size; daos_size_t remote_bulk_size; - d_iov_t map_iov; - d_sg_list_t map_sgl; - crt_bulk_t bulk; struct crt_bulk_desc map_desc; crt_bulk_opid_t map_opid; ABT_eventual eventual; int *status; int rc; - map_buf_size = pool_buf_size(map_buf->pb_nr); + map_buf_size = pool_buf_size(map_bc->pmc_buf->pb_nr); /* Check if the client bulk buffer is large enough. */ rc = crt_bulk_get_len(remote_bulk, &remote_bulk_size); @@ -176,28 +172,19 @@ ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version, goto out; } - d_iov_set(&map_iov, map_buf, map_buf_size); - map_sgl.sg_nr = 1; - map_sgl.sg_nr_out = 0; - map_sgl.sg_iovs = &map_iov; - - rc = crt_bulk_create(rpc->cr_ctx, &map_sgl, CRT_BULK_RO, &bulk); - if (rc != 0) - goto out; - /* Prepare "map_desc" for crt_bulk_transfer(). 
*/ map_desc.bd_rpc = rpc; map_desc.bd_bulk_op = CRT_BULK_PUT; map_desc.bd_remote_hdl = remote_bulk; map_desc.bd_remote_off = 0; - map_desc.bd_local_hdl = bulk; + map_desc.bd_local_hdl = map_bc->pmc_bulk; map_desc.bd_local_off = 0; - map_desc.bd_len = map_iov.iov_len; + map_desc.bd_len = map_buf_size; rc = ABT_eventual_create(sizeof(*status), &eventual); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_bulk; + goto out; } rc = crt_bulk_transfer(&map_desc, bulk_cb, &eventual, &map_opid); @@ -214,8 +201,6 @@ ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version, out_eventual: ABT_eventual_free(&eventual); -out_bulk: - crt_bulk_free(bulk); out: return rc; } diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 46db4891220..032a8183659 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -436,6 +437,10 @@ class EngineYamlParameters(YamlParameters): "D_LOG_FILE_APPEND_PID=1", "DAOS_POOL_RF=4", "CRT_EVENT_DELAY=1", + # pylint: disable-next=fixme + # FIXME disable space cache since some tests need to verify instant pool space + # changes; move this global setting to individual test settings in a follow-on PR. + "DAOS_POOL_SPACE_CACHE_INTVL=0", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], "ofi+tcp;ofi_rxm": [],