diff --git a/Makefile b/Makefile
index 59a60ba..9126072 100644
--- a/Makefile
+++ b/Makefile
@@ -101,6 +101,10 @@ al2023arm: check-region init validate release-al2023.auto.pkrvars.hcl
 al2023neu: check-region init validate release-al2023.auto.pkrvars.hcl
 	./packer build -only="amazon-ebs.al2023neu" -var "region=${REGION}" .
 
+.PHONY: al2023gpu
+al2023gpu: check-region init validate release-al2023.auto.pkrvars.hcl
+	./packer build -only="amazon-ebs.al2023gpu" -var "region=${REGION}" .
+
 shellcheck:
 	curl -fLSs ${SHELLCHECK_URL} -o /tmp/shellcheck.tar.xz
 	tar -xvf /tmp/shellcheck.tar.xz -C /tmp --strip-components=1
diff --git a/README.md b/README.md
index d9b45ce..f1b178e 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ It will create a private AMI in whatever account you are running it in.
 
 1. Setup AWS cli credentials.
 2. Make the recipe that you want, REGION must be specified. Options are: al1, al2, al2arm, al2gpu, al2keplergpu, al2inf,
-al2kernel5dot10, al2kernel5dot10arm, al2kernel5dot10gpu, al2kernel5dot10inf, al2023, al2023arm, al2023neu.
+al2kernel5dot10, al2kernel5dot10arm, al2kernel5dot10gpu, al2kernel5dot10inf, al2023, al2023arm, al2023neu, al2023gpu.
 ```
 REGION=us-west-2 make al2
 ```
diff --git a/al2023.pkr.hcl b/al2023.pkr.hcl
index f0df354..2097b72 100644
--- a/al2023.pkr.hcl
+++ b/al2023.pkr.hcl
@@ -48,7 +48,8 @@ build {
   sources = [
     "source.amazon-ebs.al2023",
     "source.amazon-ebs.al2023arm",
-    "source.amazon-ebs.al2023neu"
+    "source.amazon-ebs.al2023neu",
+    "source.amazon-ebs.al2023gpu"
   ]
 
   provisioner "file" {
@@ -178,6 +179,13 @@ build {
     script = "scripts/enable-ecs-agent-inferentia-support.sh"
   }
 
+  provisioner "shell" {
+    environment_vars = [
+      "AMI_TYPE=${source.name}"
+    ]
+    script = "scripts/enable-ecs-agent-gpu-support-al2023.sh"
+  }
+
   provisioner "shell" {
     inline_shebang = "/bin/sh -ex"
     inline = [
diff --git a/al2023gpu.pkr.hcl b/al2023gpu.pkr.hcl
new file mode 100644
index 0000000..63bc31c
--- /dev/null
+++ b/al2023gpu.pkr.hcl
@@ -0,0 +1,45 @@
+# Packer source definition for the Amazon Linux 2023 GPU variant of the ECS AMI (x86_64).
+locals {
+  ami_name_al2023gpu = "${var.ami_name_prefix_al2023}-gpu-hvm-2023.0.${var.ami_version_al2023}${var.kernel_version_al2023}-x86_64-ebs"
+  # NOTE(review): all *.pkr.hcl files loaded by `packer build .` share one locals namespace;
+  # confirm default_tags / merged_tags are not also declared in another file.
+  default_tags = {
+    os_version          = "Amazon Linux 2023"
+    source_image_name   = "{{ .SourceAMIName }}"
+    ecs_runtime_version = "Docker version ${var.docker_version_al2023}"
+    ecs_agent_version   = var.ecs_agent_version
+    ami_type            = "al2023gpu"
+    ami_version         = "2023.0.${var.ami_version_al2023}"
+  }
+  # merge() lets later arguments win, so keys in var.tags override the defaults above.
+  merged_tags = merge(local.default_tags, var.tags)
+}
+
+source "amazon-ebs" "al2023gpu" {
+  ami_name        = local.ami_name_al2023gpu
+  ami_description = "Amazon Linux AMI 2023.0.${var.ami_version_al2023} x86_64 ECS HVM EBS"
+  instance_type   = var.gpu_instance_types[0]
+  launch_block_device_mappings {
+    volume_size           = var.block_device_size_gb
+    delete_on_termination = true
+    volume_type           = "gp3"
+    device_name           = "/dev/xvda"
+  }
+  region = var.region
+  # Base image: the most recent Amazon-owned AMI whose name matches var.source_ami_al2023.
+  source_ami_filter {
+    filters = {
+      name = var.source_ami_al2023
+    }
+    owners             = ["amazon"]
+    most_recent        = true
+    include_deprecated = true
+  }
+  ami_ou_arns   = var.ami_ou_arns
+  ami_org_arns  = var.ami_org_arns
+  ami_users     = var.ami_users
+  ssh_interface = "public_ip"
+  ssh_username  = "ec2-user"
+  tags          = local.merged_tags
+  run_tags      = var.run_tags
+}
diff --git a/scripts/enable-ecs-agent-gpu-support-al2023.sh b/scripts/enable-ecs-agent-gpu-support-al2023.sh
new file mode 100644
index 0000000..500d753
--- /dev/null
+++ b/scripts/enable-ecs-agent-gpu-support-al2023.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Installs the NVIDIA GPU stack and turns on ECS GPU support while baking an AL2023 GPU AMI.
+# Reads AMI_TYPE from the environment; does nothing unless it matches al2023*gpu.
+set -ex
+
+if [[ $AMI_TYPE != "al2023"*"gpu" ]]; then
+    exit 0
+fi
+
+# AL2023 GPU setup
+sudo dnf install -y dkms kernel-modules-extra
+sudo systemctl enable --now dkms
+
+# nvidia-release creates an nvidia repo file at /etc/yum.repos.d/amazonlinux-nvidia.repo
+# docker-runtime-nvidia is a custom Amazon Linux package that sets the gpu container runtime to nvidia
+# instead of having to use 'nvidia-ctk runtime configure --runtime=docker'
+sudo dnf install -y nvidia-release
+sudo dnf install -y nvidia-driver \
+    nvidia-fabric-manager \
+    pciutils \
+    xorg-x11-server-Xorg \
+    nvidia-container-toolkit \
+    docker-runtime-nvidia
+
+sudo dnf install -y cuda
+
+# Fabric Manager configures NVLinks and NVSwitches and is required on EC2 P4d instances.
+# It is only enabled here (not started) so that it comes up when an instance boots from the AMI.
+sudo systemctl enable nvidia-fabricmanager
+
+# The NVIDIA Persistence Daemon is required on P5 instances to maintain persistent
+# software state in the NVIDIA driver; like Fabric Manager it is enabled, not started.
+sudo systemctl enable nvidia-persistenced
+
+# Enable GPU support for ECS.
+# The config is staged in /tmp and moved with sudo, replacing any existing /var/lib/ecs/ecs.config.
+mkdir -p /tmp/ecs
+echo 'ECS_ENABLE_GPU_SUPPORT=true' >/tmp/ecs/ecs.config
+sudo mv /tmp/ecs/ecs.config /var/lib/ecs/ecs.config