From f230886afba111e1aec6a40ecaf8761965c9f398 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 15 May 2023 21:22:30 +0000 Subject: [PATCH 01/14] DAOS-13386 cart: upgrade to UCX 1.14. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:2 Signed-off-by: Joseph Moore --- ci/provisioning/post_provision_config_nodes_EL_8.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/provisioning/post_provision_config_nodes_EL_8.sh b/ci/provisioning/post_provision_config_nodes_EL_8.sh index 57f5af8cabc..4a212cfe38b 100644 --- a/ci/provisioning/post_provision_config_nodes_EL_8.sh +++ b/ci/provisioning/post_provision_config_nodes_EL_8.sh @@ -64,7 +64,7 @@ install_mofed() { rm -f RPM-GPG-KEY-Mellanox dnf repolist || true - time dnf -y install mlnx-ofed-basic ucx-cma ucx-ib ucx-knem ucx-rdmacm ucx-xpmem + time dnf -y install mlnx-ofed-basic # now, upgrade firmware time dnf -y install mlnx-fw-updater From c33d0237a6c085f29d344adc1202b70865a45143 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 15 May 2023 22:54:59 +0000 Subject: [PATCH 02/14] DAOS-13386 cart: Upgrade to UCX 1.14 Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-scan-rpms: rpms Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:2 Signed-off-by: Joseph Moore From b16419604addc981055abe152990e3379ff6bb56 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 15 May 2023 23:08:29 +0000 Subject: [PATCH 03/14] DAOS-13386 cart: Upgrade to UCX 1.14. 
Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9 Signed-off-by: Joseph Moore From 886a34f37c08b683c6ddfe9935f8432e5a991851 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 19 May 2023 17:52:11 +0000 Subject: [PATCH 04/14] DAOS-13386 cart: Upgrade to UCX 1.14 Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9 Signed-off-by: Joseph Moore --- ci/provisioning/post_provision_config_nodes_EL_8.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/provisioning/post_provision_config_nodes_EL_8.sh b/ci/provisioning/post_provision_config_nodes_EL_8.sh index 4a212cfe38b..0a07fc6a3e9 100644 --- a/ci/provisioning/post_provision_config_nodes_EL_8.sh +++ b/ci/provisioning/post_provision_config_nodes_EL_8.sh @@ -55,6 +55,7 @@ install_mofed() { gversion="${gversion%.*}" fi + time dnf -y install ucx ucx-cma ucx-ib ucx-rdmacm # Add a repo to install MOFED RPMS repo_url=https://artifactory.dc.hpdd.intel.com/artifactory/mlnx_ofed/"$MLNX_VER_NUM-rhel$gversion"-x86_64/ dnf -y config-manager --add-repo="$repo_url" From ba14a0604656281142fe682eebe14faab79a0356 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 31 May 2023 14:39:37 +0000 Subject: [PATCH 05/14] DAOS-13386 cart: Upgrade to UCX 1.14. 
Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:2 Signed-off-by: Joseph Moore --- .../post_provision_config_common_functions.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index afe7b9c70ca..260ec877270 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -273,11 +273,6 @@ post_provision_config_nodes() { return 1 fi - if lspci | grep "ConnectX-6" && ! grep MOFED_VERSION /etc/do-release; then - # Remove OPA and install MOFED - install_mofed - fi - if [ -n "$INST_REPOS" ]; then local repo for repo in $INST_REPOS; do @@ -309,6 +304,11 @@ post_provision_config_nodes() { fi fi + if lspci | grep "ConnectX-6" && ! grep MOFED_VERSION /etc/do-release; then + # Remove OPA and install MOFED + install_mofed + fi + # shellcheck disable=SC2001 if ! rpm -q "$(echo "$INST_RPMS" | sed -e 's/--exclude [^ ]*//' \ From 5d48af36c69fbfc02e0e46b0d23650ecd00c5682 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 31 May 2023 15:42:22 +0000 Subject: [PATCH 06/14] DAOS-13386 cart: Test with UCX 1.14 debug build. 
Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:3 Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index e3f7fd8f090..1c3a47cb360 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -307,6 +307,9 @@ static int data_init(int server, crt_init_options_t *opt) if (server) setenv("UCX_IB_FORK_INIT", "n", 1); + setenv("D_LOG_STDERR_IN_LOG", "1", 1); + setenv("UCX_DC_MLX5_NUM_DCI", "16", 1); + /* This is a workaround for CART-871 if universe size is not set */ d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); if (fi_univ_size == 0) { From 18e26a7faa6dc6c8ca3982ed79d02b1b868380a0 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 2 Jun 2023 17:47:49 +0000 Subject: [PATCH 07/14] DAOS-12286 cart: Execute CI tests with debug build of the UCX libraries. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:3 Signed-off-by: Joseph Moore From 9a516b02d08c81ee2975754484ec17e61e5a6da7 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 7 Jun 2023 00:00:12 +0000 Subject: [PATCH 08/14] DAOS-13386 cart: Use UCX debug build. 
Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:2 Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 1c3a47cb360..36e3b489b11 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -308,6 +308,7 @@ static int data_init(int server, crt_init_options_t *opt) setenv("UCX_IB_FORK_INIT", "n", 1); setenv("D_LOG_STDERR_IN_LOG", "1", 1); + setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); setenv("UCX_DC_MLX5_NUM_DCI", "16", 1); /* This is a workaround for CART-871 if universe size is not set */ From b6b14cbe10a67ae18ef3f75a530842942544e38a Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 7 Jun 2023 00:03:01 +0000 Subject: [PATCH 09/14] DAOS-13886 cart: Test with UCX debug build. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:3 Required-githooks: true Signed-off-by: Joseph Moore From 3a5be405dd7dc3d9e24f06080cdc277e28ec2c75 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 7 Jun 2023 17:11:32 +0000 Subject: [PATCH 10/14] DAOS-13386 cart: Run with UCX debug build. 
Required-githooks: true Signed-off-by: Joseph Moore --- src/tests/ftest/mdtest/small.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tests/ftest/mdtest/small.yaml b/src/tests/ftest/mdtest/small.yaml index 94c5c93c8a7..b2f2f821a14 100644 --- a/src/tests/ftest/mdtest/small.yaml +++ b/src/tests/ftest/mdtest/small.yaml @@ -5,7 +5,6 @@ timeout: 360 server_config: name: daos_server engines_per_host: 2 - crt_timeout: 60 engines: 0: pinned_numa_node: 0 From f94bc0cdc15e7798bb03692eb3ad50a47139b957 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 13 Jun 2023 17:09:53 +0000 Subject: [PATCH 11/14] DAOS-13386 cart: Test with UCX debug build. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+rc_x PR-repos: openucx@PR-9:3 Required-githooks: true Signed-off-by: Joseph Moore --- src/tests/ftest/launch.py | 2 +- src/tests/ftest/util/network_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 719295155db..5acd20ca12d 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -60,7 +60,7 @@ [ ("cxi", "ofi+cxi"), ("verbs", "ofi+verbs"), - ("ucx", "ucx+dc_x"), + ("ucx", "ucx+rc_x"), ("tcp", "ofi+tcp"), ("opx", "ofi+opx"), ] diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index 711ddab65fc..80969e5588b 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -13,7 +13,7 @@ from exception_utils import CommandFailure from general_utils import run_task, display_task, run_pcmd -SUPPORTED_PROVIDERS = ("ofi+sockets", "ofi+tcp;ofi_rxm", "ofi+verbs;ofi_rxm", "ucx+dc_x", "ofi+cxi") +SUPPORTED_PROVIDERS = ("ofi+sockets", "ofi+tcp;ofi_rxm", "ofi+verbs;ofi_rxm", "ucx+rc_x", "ofi+cxi") 
class NetworkDevice(): From c2d70ed9cca515813e34d3fd3df092afafb9dfc1 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 13 Jun 2023 17:42:37 +0000 Subject: [PATCH 12/14] DAOS-13386 cart: Test with UCX debug build. Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+rc_x PR-repos: openucx@PR-9:3 Required-githooks: true Signed-off-by: Joseph Moore --- src/cart/crt_init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 0f7c39edfb4..13c4f66874a 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -309,7 +309,6 @@ static int data_init(int server, crt_init_options_t *opt) setenv("D_LOG_STDERR_IN_LOG", "1", 1); setenv("UCX_SOCKADDR_TLS_PRIORITY", "rdmacm", 1); - setenv("UCX_DC_MLX5_NUM_DCI", "16", 1); /* This is a workaround for CART-871 if universe size is not set */ d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); From 7af02a93df93621d35a659be5c532173367dbbdf Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 14 Jun 2023 14:19:41 +0000 Subject: [PATCH 13/14] DAOS-13386 cart: Run with UCX debug build. 
Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+ud_x PR-repos: openucx@PR-9:3 Required-githooks: true Signed-off-by: Joseph Moore --- src/tests/ftest/launch.py | 2 +- src/tests/ftest/util/network_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 5acd20ca12d..b73698e9b4a 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -60,7 +60,7 @@ [ ("cxi", "ofi+cxi"), ("verbs", "ofi+verbs"), - ("ucx", "ucx+rc_x"), + ("ucx", "ucx+ud_x"), ("tcp", "ofi+tcp"), ("opx", "ofi+opx"), ] diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index 80969e5588b..7cc57802252 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -13,7 +13,7 @@ from exception_utils import CommandFailure from general_utils import run_task, display_task, run_pcmd -SUPPORTED_PROVIDERS = ("ofi+sockets", "ofi+tcp;ofi_rxm", "ofi+verbs;ofi_rxm", "ucx+rc_x", "ofi+cxi") +SUPPORTED_PROVIDERS = ("ofi+sockets", "ofi+tcp;ofi_rxm", "ofi+verbs;ofi_rxm", "ucx+ud_x", "ofi+cxi") class NetworkDevice(): From 6749cd1bcf1e278ecaa8fc3a60f851375a582c46 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 27 Jun 2023 17:27:54 +0000 Subject: [PATCH 14/14] DAOS-13386 ucx: Run with UCX debug build. 
Skip-build-leap15-icc: true Skip-build-el8-gcc: true Skip-build-ubuntu20-rpm: true Skip-build-leap15-rpm: true Skip-func-test-vm: true Skip-unit-tests: true Skip-func-hw-test-medium-ucx-provider: false Skip-func-hw-test-medium-verbs-provider: true Test-provider: ucx+dc_x PR-repos: openucx@PR-9:15 Required-githooks: true Signed-off-by: Joseph Moore --- src/tests/ftest/launch.py | 2 +- src/tests/ftest/util/network_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index b73698e9b4a..719295155db 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -60,7 +60,7 @@ [ ("cxi", "ofi+cxi"), ("verbs", "ofi+verbs"), - ("ucx", "ucx+ud_x"), + ("ucx", "ucx+dc_x"), ("tcp", "ofi+tcp"), ("opx", "ofi+opx"), ] diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index 7cc57802252..711ddab65fc 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -13,7 +13,7 @@ from exception_utils import CommandFailure from general_utils import run_task, display_task, run_pcmd -SUPPORTED_PROVIDERS = ("ofi+sockets", "ofi+tcp;ofi_rxm", "ofi+verbs;ofi_rxm", "ucx+ud_x", "ofi+cxi") +SUPPORTED_PROVIDERS = ("ofi+sockets", "ofi+tcp;ofi_rxm", "ofi+verbs;ofi_rxm", "ucx+dc_x", "ofi+cxi") class NetworkDevice():