From 0876b58b6077f1faeb06e7a74f74ce3926ff177e Mon Sep 17 00:00:00 2001
From: Keita Watanabe
Date: Sun, 10 Mar 2024 11:56:09 +0900
Subject: [PATCH 1/3] Update 0.nvcr-pytorch-aws.dockerfile

bump to AWS_OFI_NCCL_VERSION=1.8.1-aws
---
 .../containers/pytorch/0.nvcr-pytorch-aws.dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
index 4310b79e..6f446e3e 100644
--- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
+++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
@@ -25,7 +25,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 # The three must-be-built packages.
 # Efa-installer>=1.29.0 required for nccl>=2.19.0 to avoid libfabric NCCL error.
 ENV EFA_INSTALLER_VERSION=1.30.0
-ENV AWS_OFI_NCCL_VERSION=1.7.4-aws
+ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
 ENV NCCL_TESTS_VERSION=master
 
 RUN apt-get update -y

From 25b6b082b45a78f40982e606cc3b24cd8c10fc0a Mon Sep 17 00:00:00 2001
From: Keita Watanabe
Date: Sun, 10 Mar 2024 11:57:36 +0900
Subject: [PATCH 2/3] Update 0.llm-foundry.Dockerfile

---
Review note: the new value keeps the leading "v" that the replaced value
(v1.7.4-aws) and NCCL_VERSION (v2.18.6-1) both carry in this file.
TODO: confirm against the instruction that consumes AWS_OFI_NCCL_VERSION.
The blob ids on the index line are carried over from the original submission.

 3.test_cases/3.MPT/0.llm-foundry.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3.test_cases/3.MPT/0.llm-foundry.Dockerfile b/3.test_cases/3.MPT/0.llm-foundry.Dockerfile
index 46ac3447..123d636b 100644
--- a/3.test_cases/3.MPT/0.llm-foundry.Dockerfile
+++ b/3.test_cases/3.MPT/0.llm-foundry.Dockerfile
@@ -1,7 +1,7 @@
 FROM mosaicml/llm-foundry:2.2.0_cu121_flash2_aws-latest
 
 ARG EFA_INSTALLER_VERSION=1.30.0
-ARG AWS_OFI_NCCL_VERSION=v1.7.4-aws
+ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
 ARG NCCL_TESTS_VERSION=master
 ARG NCCL_VERSION=v2.18.6-1
 ARG LLM_FOUNDRY_VERSION=v0.4.0

From 33ad5e0ec4b033efc1db1959bfabc731e3ea175e Mon Sep 17 00:00:00 2001
From: Keita Watanabe
Date: Sun, 10 Mar 2024 11:58:04 +0900
Subject: [PATCH 3/3] Update 0.tensorflow.Dockerfile

---
Review note: the new value keeps the leading "v" that the replaced value
(v1.7.4-aws) and NCCL_VERSION (v2.18.6-1) both carry in this file.
TODO: confirm against the instruction that consumes AWS_OFI_NCCL_VERSION.
The blob ids on the index line are carried over from the original submission.

 3.test_cases/7.tensorflow-distributed/0.tensorflow.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3.test_cases/7.tensorflow-distributed/0.tensorflow.Dockerfile b/3.test_cases/7.tensorflow-distributed/0.tensorflow.Dockerfile
index f91797df..03eaabbd 100644
--- a/3.test_cases/7.tensorflow-distributed/0.tensorflow.Dockerfile
+++ b/3.test_cases/7.tensorflow-distributed/0.tensorflow.Dockerfile
@@ -1,7 +1,7 @@
 FROM nvcr.io/nvidia/tensorflow:23.10-tf2-py3
 
 ARG EFA_INSTALLER_VERSION=1.30.0
-ARG AWS_OFI_NCCL_VERSION=v1.7.4-aws
+ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
 ARG NCCL_TESTS_VERSION=master
 ARG NCCL_VERSION=v2.18.6-1
 ARG OPEN_MPI_PATH=/opt/amazon/openmpi