Skip to content

Commit

Permalink
DAOS-16916 container: serialize container open (#15796)
Browse files Browse the repository at this point in the history
Serialize container open by ABT mutex.

Remove the incorrect sc_open assertion, which could fire spuriously if an IV fetch
failed and a subsequent container open was then retried.

Signed-off-by: Di Wang <[email protected]>
  • Loading branch information
wangdi1 authored Jan 27, 2025
1 parent fe0e820 commit 8e2a0d8
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 41 deletions.
55 changes: 20 additions & 35 deletions src/container/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -1610,57 +1610,54 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
D_ASSERT(hdl->sch_cont->sc_pool != NULL);

hdl->sch_cont->sc_open++;
if (hdl->sch_cont->sc_open > 1) {
/* If there is an inflight open being stuck, then
* let's retry and wait until it finished.
*/
if (hdl->sch_cont->sc_open_initializing) {
hdl->sch_cont->sc_open--;
D_GOTO(err_cont, rc = -DER_AGAIN);
}

/* Only go through if the 1st open succeeds */
if (hdl->sch_cont->sc_props_fetched)
goto opened;
}
if (hdl->sch_cont->sc_open > 1 && hdl->sch_cont->sc_props_fetched)
goto opened;

hdl->sch_cont->sc_open_initializing = 1;
if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false))
goto csum_init;

/* The open process may yield, and a concurrent open could then cause
* problems, so serialize the process with sc_mutex. */
ABT_mutex_lock(hdl->sch_cont->sc_mutex);
if (hdl->sch_cont->sc_open > 1 && hdl->sch_cont->sc_props_fetched) {
ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
goto opened;
}

rc = dtx_cont_open(hdl->sch_cont);
if (rc != 0) {
D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont "
DF_UUID": %d\n", DP_UUID(cont_uuid), hdl->sch_cont->sc_open);

ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
hdl->sch_cont->sc_open--;
D_GOTO(err_cont, rc);
}

D_ALLOC_PTR(ddra);
if (ddra == NULL)
if (ddra == NULL) {
ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
D_GOTO(err_dtx, rc = -DER_NOMEM);
}

ddra->pool = ds_pool_child_lookup(hdl->sch_cont->sc_pool->spc_uuid);
if (ddra->pool == NULL) {
ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
D_FREE(ddra);
D_GOTO(err_dtx, rc = -DER_NO_HDL);
}
uuid_copy(ddra->co_uuid, cont_uuid);
rc = dss_ult_create(ds_dtx_resync, ddra, DSS_XS_SELF,
0, 0, NULL);
if (rc != 0) {
ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
ds_pool_child_put(hdl->sch_cont->sc_pool);
D_FREE(ddra);
D_GOTO(err_dtx, rc);
}

csum_init:
rc = ds_cont_csummer_init(hdl->sch_cont);
ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
if (rc != 0)
D_GOTO(err_dtx, rc);

hdl->sch_cont->sc_open_initializing = 0;
}
opened:
if (cont_hdl != NULL) {
Expand All @@ -1671,14 +1668,10 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
return 0;

err_dtx:
D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont "
DF_UUID": %d\n", DP_UUID(cont_uuid), hdl->sch_cont->sc_open);

hdl->sch_cont->sc_open--;
dtx_cont_close(hdl->sch_cont, true);

err_cont:
hdl->sch_cont->sc_open_initializing = 0;
if (daos_handle_is_valid(poh)) {
int rc_tmp;

Expand Down Expand Up @@ -1766,25 +1759,17 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
D_DEBUG(DB_TRACE, "open pool/cont/hdl "DF_UUID"/"DF_UUID"/"DF_UUID"\n",
DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid));

retry:
rc = ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN |
PO_COMP_ST_DOWNOUT, cont_open_one, &arg, 0);
if (rc != 0) {
if (rc == -DER_AGAIN) {
dss_sleep(50);
goto retry;
}

if (rc != 0)
/* After a target is excluded from the pool it may still be in the
* CaRT group, so an IV cont open can still reach this target,
* especially when cont open/close is done asynchronously by IV.
* In that case cont_open_one may return -DER_NO_HDL because it
* cannot find the pool handle. (DAOS-3185)
*/
D_ERROR("open "DF_UUID"/"DF_UUID"/"DF_UUID":"DF_RC"\n",
DP_UUID(pool_uuid), DP_UUID(cont_uuid),
DP_UUID(cont_hdl_uuid), DP_RC(rc));
}
D_ERROR("open " DF_UUID "/" DF_UUID "/" DF_UUID ":" DF_RC "\n", DP_UUID(pool_uuid),
DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid), DP_RC(rc));

return rc;
}
Expand Down
2 changes: 1 addition & 1 deletion src/dtx/dtx_common.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1912,7 +1913,6 @@ dtx_cont_open(struct ds_cont_child *cont)
int rc;

D_ASSERT(cont != NULL);
D_ASSERT(cont->sc_open == 1);

d_list_for_each_entry(dbpa, &dmi->dmi_dtx_batched_pool_list, dbpa_sys_link) {
if (dbpa->dbpa_pool != cont->sc_pool)
Expand Down
19 changes: 14 additions & 5 deletions src/include/daos_srv/container.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,20 @@ struct ds_cont_child {
ABT_cond sc_scrub_cond;
ABT_cond sc_rebuild_cond;
ABT_cond sc_fini_cond;
uint32_t sc_dtx_resyncing : 1, sc_dtx_reindex : 1, sc_dtx_reindex_abort : 1,
sc_dtx_delay_reset : 1, sc_dtx_registered : 1, sc_props_fetched : 1, sc_stopping : 1,
sc_destroying : 1, sc_vos_agg_active : 1, sc_ec_agg_active : 1,
/* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
sc_rw_disabled : 1, sc_scrubbing : 1, sc_rebuilding : 1, sc_open_initializing : 1;
uint32_t sc_dtx_resyncing : 1;
uint32_t sc_dtx_reindex : 1;
uint32_t sc_dtx_reindex_abort : 1;
uint32_t sc_dtx_delay_reset : 1;
uint32_t sc_dtx_registered : 1;
uint32_t sc_props_fetched : 1;
uint32_t sc_stopping : 1;
uint32_t sc_destroying : 1;
uint32_t sc_vos_agg_active : 1;
uint32_t sc_ec_agg_active : 1;
/* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
uint32_t sc_rw_disabled : 1;
uint32_t sc_scrubbing : 1;
uint32_t sc_rebuilding : 1;
uint32_t sc_dtx_batched_gen;
/* Tracks the schedule request for aggregation ULT */
struct sched_request *sc_agg_req;
Expand Down

0 comments on commit 8e2a0d8

Please sign in to comment.