From 2a98fd281cc3245f25cac3a40f423fe88a5da9dc Mon Sep 17 00:00:00 2001 From: Kayo Date: Tue, 7 Jan 2025 04:53:20 +0000 Subject: [PATCH 1/3] feat: Add AL2023 GPU AMI fixes #319 --- Makefile | 4 ++ al2023.pkr.hcl | 8 +++ al2023gpu.pkr.hcl | 40 +++++++++++++ scripts/enable-ecs-agent-gpu-support.sh | 75 +++++++++++++++++++------ 4 files changed, 111 insertions(+), 16 deletions(-) create mode 100644 al2023gpu.pkr.hcl diff --git a/Makefile b/Makefile index 59a60bae..29f63a72 100644 --- a/Makefile +++ b/Makefile @@ -93,6 +93,10 @@ al2kernel5dot10inf: check-region init validate release-al2.auto.pkrvars.hcl al2023: check-region init validate release-al2023.auto.pkrvars.hcl ./packer build -only="amazon-ebs.al2023" -var "region=${REGION}" . +.PHONY: al2023gpu +al2023gpu: check-region init validate release-al2023.auto.pkrvars.hcl + ./packer build -only="amazon-ebs.al2023gpu" -var "region=${REGION}" . + .PHONY: al2023arm al2023arm: check-region init validate release-al2023.auto.pkrvars.hcl ./packer build -only="amazon-ebs.al2023arm" -var "region=${REGION}" . diff --git a/al2023.pkr.hcl b/al2023.pkr.hcl index b4041ded..8ad2967b 100644 --- a/al2023.pkr.hcl +++ b/al2023.pkr.hcl @@ -42,6 +42,7 @@ source "amazon-ebs" "al2023" { build { sources = [ "source.amazon-ebs.al2023", + "source.amazon-ebs.al2023gpu", "source.amazon-ebs.al2023arm", "source.amazon-ebs.al2023neu" ] @@ -172,6 +173,13 @@ build { script = "scripts/enable-ecs-agent-inferentia-support.sh" } + provisioner "shell" { + environment_vars = [ + "AMI_TYPE=${source.name}" + ] + script = "scripts/enable-ecs-agent-gpu-support.sh" + } + provisioner "shell" { inline_shebang = "/bin/sh -ex" inline = [ diff --git a/al2023gpu.pkr.hcl b/al2023gpu.pkr.hcl new file mode 100644 index 00000000..64c5f524 --- /dev/null +++ b/al2023gpu.pkr.hcl @@ -0,0 +1,40 @@ +locals { + ami_name_al2023gpu = "${var.ami_name_prefix_al2023}-gpu-hvm-2023.0.${var.ami_version_al2023}${var.kernel_version_al2023}-x86_64" + default_tags = { + os_version = "Amazon Linux 2023" + source_image_name = "{{ .SourceAMIName }}" + ecs_runtime_version = "Docker version ${var.docker_version_al2023}" + ecs_agent_version = "${var.ecs_agent_version}" + ami_type = "al2023gpu" + ami_version = "2023.0.${var.ami_version_al2023}" + } + merged_tags = merge("${local.default_tags}", "${var.tags}") +} + +source "amazon-ebs" "al2023gpu" { + ami_name = "${local.ami_name_al2023gpu}" + ami_description = "Amazon Linux AMI 2023.0.${var.ami_version_al2023} x86_64 ECS GPU HVM EBS" + instance_type = var.gpu_instance_types[0] + launch_block_device_mappings { + volume_size = var.block_device_size_gb + delete_on_termination = true + volume_type = "gp3" + device_name = "/dev/xvda" + } + region = var.region + source_ami_filter { + filters = { + name = "${var.source_ami_al2023}" + } + owners = ["amazon"] + most_recent = true + include_deprecated = true + } + ami_ou_arns = "${var.ami_ou_arns}" + ami_org_arns = "${var.ami_org_arns}" + ami_users = "${var.ami_users}" + ssh_interface = "public_ip" + ssh_username = "ec2-user" + tags = "${local.merged_tags}" + run_tags = "${var.run_tags}" +} diff --git a/scripts/enable-ecs-agent-gpu-support.sh b/scripts/enable-ecs-agent-gpu-support.sh index 9677eb88..ea52c983 100644 --- a/scripts/enable-ecs-agent-gpu-support.sh +++ b/scripts/enable-ecs-agent-gpu-support.sh @@ -15,14 +15,34 @@ if [[ $AMI_TYPE != "al2"*"gpu" ]]; then exit 0 fi -# set up amzn2-nvidia repo -GPG_CHECK=1 -# don't do the gpg check in air-gapped regions -if [ -n "$AIR_GAPPED" ]; then - GPG_CHECK=0 -fi -tmpfile=$(mktemp) -cat >$tmpfile <$tmpfile <<"EOF" +[nvidia-container-toolkit] +name=nvidia-container-toolkit +baseurl=https://nvidia.github.io/libnvidia-container/stable/rpm/$basearch +repo_gpgcheck=1 +gpgcheck=0 +enabled=1 +gpgkey=https://nvidia.github.io/libnvidia-container/gpgkey +sslverify=1 +sslcacert=/etc/pki/tls/certs/ca-bundle.crt +EOF + sudo mv $tmpfile "/etc/yum.repos.d/nvidia-container-toolkit.repo" + + # https://github.com/aws/amazon-ecs-ami/issues/319#issuecomment-2471834667 + sudo dnf install -y nvidia-release + sudo dnf clean all +else + # set up amzn2-nvidia repo + GPG_CHECK=1 + # don't do the gpg check in air-gapped regions + if [ -n "$AIR_GAPPED" ]; then + GPG_CHECK=0 + fi + tmpfile=$(mktemp) + cat >$tmpfile < Date: Thu, 9 Jan 2025 21:24:23 +0000 Subject: [PATCH 2/3] fix(al2023gpu): Remove cuda and unneeded packages --- scripts/enable-ecs-agent-gpu-support.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/enable-ecs-agent-gpu-support.sh b/scripts/enable-ecs-agent-gpu-support.sh index ea52c983..35c7ce23 100644 --- a/scripts/enable-ecs-agent-gpu-support.sh +++ b/scripts/enable-ecs-agent-gpu-support.sh @@ -160,16 +160,10 @@ elif [[ $AMI_TYPE == "al2023"*"gpu" ]]; then kernel-modules-extra-"$kernel_release" \ nvidia-driver \ nvidia-fabric-manager \ - pciutils \ - xorg-x11-server-Xorg \ - oci-add-hooks \ libnvidia-container1 \ libnvidia-container-tools \ nvidia-container-toolkit - sudo dnf install -y cuda-drivers \ - cuda - # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker sudo nvidia-ctk runtime configure --runtime=docker sudo systemctl restart docker From 0c5b39ed327e97f1cef69a6ad1e56948e4a27824 Mon Sep 17 00:00:00 2001 From: Kayo Date: Thu, 9 Jan 2025 21:24:49 +0000 Subject: [PATCH 3/3] feat(al2023gpu): Add containerd config --- scripts/enable-ecs-agent-gpu-support.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/enable-ecs-agent-gpu-support.sh b/scripts/enable-ecs-agent-gpu-support.sh index 35c7ce23..0330f652 100644 --- a/scripts/enable-ecs-agent-gpu-support.sh +++ b/scripts/enable-ecs-agent-gpu-support.sh @@ -164,9 +164,11 @@ elif [[ $AMI_TYPE == "al2023"*"gpu" ]]; then libnvidia-container-tools \ nvidia-container-toolkit - # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker + # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html sudo nvidia-ctk runtime configure --runtime=docker + sudo nvidia-ctk runtime configure --runtime=containerd sudo systemctl restart docker + sudo systemctl restart containerd else # Default GPU AMI sudo yum install -y kernel-devel-$(uname -r) \