From 8fe77b9ae1f2e0a51a4d23af5981f65a1cda0236 Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Mon, 3 Feb 2025 17:01:38 -0600 Subject: [PATCH] DAOS-16990 cart: workaround to CXI init errors with retrying HG init (#15833) This is a workaround for DAOS-16990 and DAOS-17011. When using the CXI provider, clients retry HG_Init_opt2() on failure, since CXI appears to have intermittent initialization issues. A new client-side environment variable, CRT_CXI_INIT_RETRY, controls the retry count (default is 3); setting it to 0 disables retrying, which allows future SS fixes to be tested without the workaround. Signed-off-by: Mohamad Chaarawi --- src/cart/README.env | 2 ++ src/cart/crt_hg.c | 12 +++++++++++- src/cart/crt_init.c | 7 +++++++ src/cart/crt_internal_types.h | 4 ++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/cart/README.env b/src/cart/README.env index 6457ea19467..6bad14ad41e 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -211,3 +211,5 @@ This file lists the environment variables used in CaRT. traffic congestion. Available options are: "unspec" (default), "best_effort", "low_latency", "bulk_data". + . CRT_CXI_INIT_RETRY + Retry count for HG_Init_opt2() when initializing the CXI provider (default = 3). diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 8f2395a44ff..f1dd6a3f2c3 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -834,6 +835,7 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ char addr_str[CRT_ADDR_STR_MAX_LEN] = {'\0'}; size_t str_size = CRT_ADDR_STR_MAX_LEN; struct crt_prov_gdata *prov_data; + uint32_t retry_count = 0; int rc = DER_SUCCESS; prov_data = crt_get_prov_gdata(primary, provider); @@ -869,9 +871,17 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc; if (thread_mode_single) init_info.na_init_info.thread_mode = NA_THREAD_MODE_SINGLE; - +retry: hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info); if (hg_class == NULL) { + /** workaround for DAOS-16990, DAOS-17011 - retry a few times on init */ + if (provider == CRT_PROV_OFI_CXI && !crt_is_service() && + retry_count < crt_gdata.cg_hg_init_retry_cnt) { + retry_count++; + D_WARN("Could not initialize HG class; retrying (%d)\n", retry_count); + sleep(retry_count * 5); + goto retry; + } D_ERROR("Could not initialize HG class.\n"); D_GOTO(out, rc = -DER_HG); } diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index be43b8d1f1a..d66d99cd65c 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -281,11 +282,17 @@ data_init(int server, crt_init_options_t *opt) if (mem_pin_enable == 1) mem_pin_workaround(); } else { + int retry_count = 3; + /* * Client-side envariable to indicate that the cluster * is running using a secondary provider */ crt_env_get(CRT_SECONDARY_PROVIDER, &is_secondary); + + /** Client side env for hg_init() retries */ + crt_env_get(CRT_CXI_INIT_RETRY, &retry_count); + crt_gdata.cg_hg_init_retry_cnt = retry_count; } crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1; diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index f10bf38d7c8..d35148c2bfe 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -170,6 +171,8 @@ struct crt_gdata { long cg_num_cores; /** Inflight rpc quota limit */ uint32_t cg_rpc_quota; + /** Retry count of HG_Init_opt2() on failure when using CXI provider */ + uint32_t cg_hg_init_retry_cnt; }; extern struct crt_gdata crt_gdata; @@ -197,6 +200,7 @@ struct crt_event_cb_priv { ENV_STR(CRT_ATTACH_INFO_PATH) \ ENV(CRT_CREDIT_EP_CTX) \ ENV(CRT_CTX_NUM) \ + ENV(CRT_CXI_INIT_RETRY) \ ENV(CRT_ENABLE_MEM_PIN) \ ENV_STR(CRT_L_GRP_CFG) \ ENV(CRT_L_RANK) \