Skip to content

Commit

Permalink
DAOS-16990 cart: workaround to CXI init errors with retrying HG init (#…
Browse files Browse the repository at this point in the history
…15833)

This is a workaround for DAOS-16990 and DAOS-17011.

When using the CXI provider, retry HG_Init_opt2() when it fails, since
CXI appears to have intermittent issues during initialization. A new
environment variable (CRT_CXI_INIT_RETRY) is added to control the retry
count (default is 3); it also makes it possible to test future SS fixes
with retries disabled.

Signed-off-by: Mohamad Chaarawi <[email protected]>
  • Loading branch information
mchaarawi authored Feb 3, 2025
1 parent aee0149 commit 8fe77b9
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 1 deletion.
2 changes: 2 additions & 0 deletions src/cart/README.env
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,5 @@ This file lists the environment variables used in CaRT.
traffic congestion. Available options are: "unspec" (default), "best_effort",
"low_latency", "bulk_data".

. CRT_CXI_INIT_RETRY
Client-side retry count for HG_Init_opt2() when initializing the CXI provider
(default = 3). Set to 0 to disable retries.
12 changes: 11 additions & 1 deletion src/cart/crt_hg.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -834,6 +835,7 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
char addr_str[CRT_ADDR_STR_MAX_LEN] = {'\0'};
size_t str_size = CRT_ADDR_STR_MAX_LEN;
struct crt_prov_gdata *prov_data;
uint32_t retry_count = 0;
int rc = DER_SUCCESS;

prov_data = crt_get_prov_gdata(primary, provider);
Expand Down Expand Up @@ -869,9 +871,17 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc;
if (thread_mode_single)
init_info.na_init_info.thread_mode = NA_THREAD_MODE_SINGLE;

retry:
hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info);
if (hg_class == NULL) {
/** workaround for DAOS-16990, DAOS-17011 - retry a few times on init */
if (provider == CRT_PROV_OFI_CXI && !crt_is_service() &&
retry_count < crt_gdata.cg_hg_init_retry_cnt) {
retry_count++;
D_WARN("Could not initialize HG class; retrying (%d)\n", retry_count);
sleep(retry_count * 5);
goto retry;
}
D_ERROR("Could not initialize HG class.\n");
D_GOTO(out, rc = -DER_HG);
}
Expand Down
7 changes: 7 additions & 0 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -281,11 +282,17 @@ data_init(int server, crt_init_options_t *opt)
if (mem_pin_enable == 1)
mem_pin_workaround();
} else {
int retry_count = 3;

/*
* Client-side envariable to indicate that the cluster
* is running using a secondary provider
*/
crt_env_get(CRT_SECONDARY_PROVIDER, &is_secondary);

/** Client-side env for the HG_Init_opt2() retry count */
crt_env_get(CRT_CXI_INIT_RETRY, &retry_count);
crt_gdata.cg_hg_init_retry_cnt = retry_count;
}
crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1;

Expand Down
4 changes: 4 additions & 0 deletions src/cart/crt_internal_types.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -170,6 +171,8 @@ struct crt_gdata {
long cg_num_cores;
/** Inflight rpc quota limit */
uint32_t cg_rpc_quota;
/** Retry count of HG_Init_opt2() on failure when using CXI provider */
uint32_t cg_hg_init_retry_cnt;
};

extern struct crt_gdata crt_gdata;
Expand Down Expand Up @@ -197,6 +200,7 @@ struct crt_event_cb_priv {
ENV_STR(CRT_ATTACH_INFO_PATH) \
ENV(CRT_CREDIT_EP_CTX) \
ENV(CRT_CTX_NUM) \
ENV(CRT_CXI_INIT_RETRY) \
ENV(CRT_ENABLE_MEM_PIN) \
ENV_STR(CRT_L_GRP_CFG) \
ENV(CRT_L_RANK) \
Expand Down

0 comments on commit 8fe77b9

Please sign in to comment.