From 3fc16d3da649bf9691a76f6732e5838b66869526 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 1 May 2023 18:16:53 +0000 Subject: [PATCH 01/11] DAOS-13268 cart: Fix UCX to use rdmacm. Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 884a2ac233d..44280d1d836 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -307,6 +307,8 @@ static int data_init(int server, crt_init_options_t *opt) if (server) setenv("UCX_IB_FORK_INIT", "n", 1); + setenv(" UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); + /* This is a workaround for CART-871 if universe size is not set */ d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); if (fi_univ_size == 0) { From 9c0622b40d4ef39dccf7b7761d5aa05986767ceb Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 1 May 2023 20:15:31 +0000 Subject: [PATCH 02/11] DAOS-13268 cart: Fix UCX. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 44280d1d836..bf9da0e89a5 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -307,7 +307,9 @@ static int data_init(int server, crt_init_options_t *opt) if (server) setenv("UCX_IB_FORK_INIT", "n", 1); - setenv(" UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); + setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); + setenv("UCX_LOG_LEVEL", "diag", 1); + setenv("D_LOG_STDERR_IN_LOG", "1", 1); /* This is a workaround for CART-871 if universe size is not set */ d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); From 198bc340c624e54521be75df73353a98f8bbb5f6 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 2 May 2023 01:17:30 +0000 Subject: [PATCH 03/11] DAOS-13268 cart: Fix UCX. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index bf9da0e89a5..8b3fb3d63b4 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -269,7 +269,7 @@ static int data_init(int server, crt_init_options_t *opt) crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1; timeout = 0; - +G if (opt && opt->cio_crt_timeout != 0) timeout = opt->cio_crt_timeout; else @@ -308,8 +308,9 @@ static int data_init(int server, crt_init_options_t *opt) setenv("UCX_IB_FORK_INIT", "n", 1); setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); - setenv("UCX_LOG_LEVEL", "diag", 1); - setenv("D_LOG_STDERR_IN_LOG", "1", 1); + setenv("HG_LOG_LEVEL", "warning", 1); + setenv("HG_LOG_SUBSYS", "na", 1); + //setenv("D_LOG_STDERR_IN_LOG", "1", 1); /* This is a workaround for CART-871 if universe size is not set */ d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); From 9a216be8a45ffad955a276cf5535b27450ed8243 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 2 May 2023 01:24:34 +0000 Subject: [PATCH 04/11] DAOS-13268 cart: Fix UCX. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 8b3fb3d63b4..3d38b185145 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -307,10 +307,11 @@ G if (server) setenv("UCX_IB_FORK_INIT", "n", 1); - setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); + //setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); setenv("HG_LOG_LEVEL", "warning", 1); + setenv("UCX_LOG_LEVEL", "diag", 1); setenv("HG_LOG_SUBSYS", "na", 1); - //setenv("D_LOG_STDERR_IN_LOG", "1", 1); + setenv("D_LOG_STDERR_IN_LOG", "1", 1); /* This is a workaround for CART-871 if universe size is not set */ d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); From 937c8705a5ecccb36027b02110112531cc00ed89 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 2 May 2023 01:37:33 +0000 Subject: [PATCH 05/11] DAOS-13268 cart: Fix UCX. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 3d38b185145..448ebe2d25f 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -269,7 +269,7 @@ static int data_init(int server, crt_init_options_t *opt) crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1; timeout = 0; -G + if (opt && opt->cio_crt_timeout != 0) timeout = opt->cio_crt_timeout; else From 9a51e920e44c9558c13db99c8e9efaa5a74c7e08 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 2 May 2023 15:38:50 +0000 Subject: [PATCH 06/11] DAOS-13268 cart: Fix UCX. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-verbs-provider: true Skip-func-hw-test-medium-ucx-provider: false Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 448ebe2d25f..3e2e8895b8a 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -307,7 +307,8 @@ static int data_init(int server, crt_init_options_t *opt) if (server) setenv("UCX_IB_FORK_INIT", "n", 1); - //setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); + setenv("UCX_WARN_UNUSED_ENV_VARS", "n", 1); + setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); setenv("HG_LOG_LEVEL", "warning", 1); setenv("UCX_LOG_LEVEL", "diag", 1); setenv("HG_LOG_SUBSYS", "na", 1); From 0e04cd749c0eef425788bc48d9ccf6834501f8f3 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 2 May 2023 23:01:38 +0000 Subject: [PATCH 07/11] DAOS-13268 cart: Fix UCX. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-verbs-provider: true Skip-func-hw-test-medium-ucx-provider: false Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_hg.c | 1 + src/cart/crt_init.c | 5 ----- src/tests/ftest/launch.py | 3 +++ 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index a66c43c0c31..71dfb4c8993 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -730,6 +730,7 @@ crt_hg_init(void) env = getenv("HG_LOG_LEVEL"); if (!env) HG_Set_log_level("warning"); + HG_Set_log_subsys("hg,na"); } /* import HG log */ diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 3e2e8895b8a..456b9632248 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -307,11 +307,6 @@ static int data_init(int server, crt_init_options_t *opt) if (server) setenv("UCX_IB_FORK_INIT", "n", 1); - setenv("UCX_WARN_UNUSED_ENV_VARS", "n", 1); - setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); - setenv("HG_LOG_LEVEL", "warning", 1); - setenv("UCX_LOG_LEVEL", "diag", 1); - setenv("HG_LOG_SUBSYS", "na", 1); setenv("D_LOG_STDERR_IN_LOG", "1", 1); /* This is a workaround for CART-871 if universe size is not set */ diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 768d97e5711..0dab233944e 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -1053,6 +1053,9 @@ def _set_test_environment(self, servers, clients, list_tests, provider, insecure path = os.environ.get("PATH") os.environ["COVFILE"] = BULLSEYE_FILE + os.environ["UCX_LOG_LEVEL"] = "trace" + os.environ["D_LOG_STDERR_IN_LOG"] = "1" + if not list_tests: # Get the default fabric_iface value (DAOS_TEST_FABRIC_IFACE) self._set_interface_environment(servers, clients) From 721e0cdc084939c10a1ce1e6fa4865b9269c66f0 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 4 May 2023 22:33:58 +0000 Subject: [PATCH 08/11] DAOS-13268 cart: Fix UCX. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/tests/ftest/daos_test/dfs.yaml | 1 + src/tests/ftest/ior/small.yaml | 1 + src/tests/ftest/launch.py | 3 --- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tests/ftest/daos_test/dfs.yaml b/src/tests/ftest/daos_test/dfs.yaml index dad686ceae5..83c42e38ede 100644 --- a/src/tests/ftest/daos_test/dfs.yaml +++ b/src/tests/ftest/daos_test/dfs.yaml @@ -13,6 +13,7 @@ pool: server_config: name: daos_server engines_per_host: 2 + crt_timeout: 60 engines: 0: pinned_numa_node: 0 diff --git a/src/tests/ftest/ior/small.yaml b/src/tests/ftest/ior/small.yaml index ccc835b4ff9..d8717172e27 100644 --- a/src/tests/ftest/ior/small.yaml +++ b/src/tests/ftest/ior/small.yaml @@ -5,6 +5,7 @@ timeout: 700 server_config: name: daos_server engines_per_host: 2 + crt_timeout: 60 engines: 0: pinned_numa_node: 0 diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 0dab233944e..768d97e5711 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -1053,9 +1053,6 @@ def _set_test_environment(self, servers, clients, list_tests, provider, insecure path = os.environ.get("PATH") os.environ["COVFILE"] = BULLSEYE_FILE - os.environ["UCX_LOG_LEVEL"] = "trace" - os.environ["D_LOG_STDERR_IN_LOG"] = "1" - if not list_tests: # Get the default fabric_iface value (DAOS_TEST_FABRIC_IFACE) self._set_interface_environment(servers, clients) From 75666de453312999861fd59d36702421746a793c Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 5 May 2023 14:51:17 +0000 Subject: [PATCH 09/11] Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-scan-leap15-rpms: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/tests/ftest/mdtest/small.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/ftest/mdtest/small.yaml b/src/tests/ftest/mdtest/small.yaml index dfe62f2328a..261f8ce65c9 100644 --- a/src/tests/ftest/mdtest/small.yaml +++ b/src/tests/ftest/mdtest/small.yaml @@ -5,6 +5,7 @@ timeout: 360 server_config: name: daos_server engines_per_host: 2 + crt_timeout: 60 engines: 0: pinned_numa_node: 0 From f9e2ffc8603bdc325d0e3edf8d3231d15a401534 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 19 May 2023 15:45:01 +0000 Subject: [PATCH 10/11] DAOS-13268 cart: Fix UCX. Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 456b9632248..64b4f83f7c6 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -308,6 +308,7 @@ static int data_init(int server, crt_init_options_t *opt) setenv("UCX_IB_FORK_INIT", "n", 1); setenv("D_LOG_STDERR_IN_LOG", "1", 1); + setenv("UCX_DC_MLX5_NUM_DCI", "16", 1); /* This is a workaround for CART-871 if universe size is not set */ d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); From 1405ffcb014f7c27e30e49e3b8f8ffcc86d67a9c Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 31 May 2023 14:20:15 +0000 Subject: [PATCH 11/11] DAOS-13268 cart: Upgrade to UCX 1.14. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:2 Required-githooks: true Signed-off-by: Joseph Moore --- .../post_provision_config_common_functions.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index afe7b9c70ca..260ec877270 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -273,11 +273,6 @@ post_provision_config_nodes() { return 1 fi - if lspci | grep "ConnectX-6" && ! grep MOFED_VERSION /etc/do-release; then - # Remove OPA and install MOFED - install_mofed - fi - if [ -n "$INST_REPOS" ]; then local repo for repo in $INST_REPOS; do @@ -309,6 +304,11 @@ post_provision_config_nodes() { fi fi + if lspci | grep "ConnectX-6" && ! grep MOFED_VERSION /etc/do-release; then + # Remove OPA and install MOFED + install_mofed + fi + # shellcheck disable=SC2001 if ! rpm -q "$(echo "$INST_RPMS" | sed -e 's/--exclude [^ ]*//' \