Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add AL2023 GPU AMI #362

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ al2kernel5dot10inf: check-region init validate release-al2.auto.pkrvars.hcl
al2023: check-region init validate release-al2023.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2023" -var "region=${REGION}" .

# Build only the AL2023 GPU AMI: restricts packer to the "amazon-ebs.al2023gpu"
# source and passes ${REGION} through as the `region` variable.
# Uses the same prerequisites (including release-al2023.auto.pkrvars.hcl) as the
# neighbouring al2023 and al2023arm targets.
.PHONY: al2023gpu
al2023gpu: check-region init validate release-al2023.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2023gpu" -var "region=${REGION}" .

.PHONY: al2023arm
al2023arm: check-region init validate release-al2023.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2023arm" -var "region=${REGION}" .
Expand Down
8 changes: 8 additions & 0 deletions al2023.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ source "amazon-ebs" "al2023" {
build {
sources = [
"source.amazon-ebs.al2023",
"source.amazon-ebs.al2023gpu",
"source.amazon-ebs.al2023arm",
"source.amazon-ebs.al2023neu"
]
Expand Down Expand Up @@ -172,6 +173,13 @@ build {
script = "scripts/enable-ecs-agent-inferentia-support.sh"
}

# Runs the GPU support script with AMI_TYPE exported as the name of the source
# currently being built (e.g. "al2023gpu").
# NOTE(review): there is no `only` filter on this provisioner, so it appears to
# run for every source in this build; presumably the script itself decides from
# AMI_TYPE whether to do anything — confirm.
provisioner "shell" {
environment_vars = [
"AMI_TYPE=${source.name}"
]
script = "scripts/enable-ecs-agent-gpu-support.sh"
}

provisioner "shell" {
inline_shebang = "/bin/sh -ex"
inline = [
Expand Down
40 changes: 40 additions & 0 deletions al2023gpu.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Naming and tagging values for the AL2023 GPU AMI.
# NOTE(review): Packer locals share a single namespace across all *.pkr.hcl
# files in the directory — confirm that `default_tags` and `merged_tags` are not
# also declared in another file, otherwise the build will fail on a duplicate.
locals {
  # Shape: <prefix>-gpu-hvm-2023.0.<ami version><kernel version>-x86_64
  ami_name_al2023gpu = "${var.ami_name_prefix_al2023}-gpu-hvm-2023.0.${var.ami_version_al2023}${var.kernel_version_al2023}-x86_64"

  # Tags applied to the AMI unless overridden through var.tags.
  default_tags = {
    os_version = "Amazon Linux 2023"
    # Kept as a literal Packer template string; it is not HCL interpolation.
    source_image_name   = "{{ .SourceAMIName }}"
    ecs_runtime_version = "Docker version ${var.docker_version_al2023}"
    ecs_agent_version   = var.ecs_agent_version
    ami_type            = "al2023gpu"
    ami_version         = "2023.0.${var.ami_version_al2023}"
  }

  # var.tags is passed last, so its keys win over default_tags on conflict.
  merged_tags = merge(local.default_tags, var.tags)
}

# Packer source for the Amazon Linux 2023 x86_64 ECS GPU AMI.
# Consumed by the build block as "source.amazon-ebs.al2023gpu".
source "amazon-ebs" "al2023gpu" {
  ami_name        = local.ami_name_al2023gpu
  ami_description = "Amazon Linux AMI 2023.0.${var.ami_version_al2023} x86_64 ECS GPU HVM EBS"

  # The first entry of the GPU instance-type list is used for the build instance.
  instance_type = var.gpu_instance_types[0]

  launch_block_device_mappings {
    volume_size           = var.block_device_size_gb
    delete_on_termination = true
    volume_type           = "gp3"
    device_name           = "/dev/xvda"
  }

  region = var.region

  # Pick the newest Amazon-owned image matching the AL2023 name filter;
  # deprecated images are eligible as well.
  source_ami_filter {
    filters = {
      name = var.source_ami_al2023
    }
    owners             = ["amazon"]
    most_recent        = true
    include_deprecated = true
  }

  # Sharing settings for the resulting AMI.
  ami_ou_arns  = var.ami_ou_arns
  ami_org_arns = var.ami_org_arns
  ami_users    = var.ami_users

  ssh_interface = "public_ip"
  ssh_username  = "ec2-user"

  tags     = local.merged_tags
  run_tags = var.run_tags
}
71 changes: 55 additions & 16 deletions scripts/enable-ecs-agent-gpu-support.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,34 @@ if [[ $AMI_TYPE != "al2"*"gpu" ]]; then
exit 0
fi

# set up amzn2-nvidia repo
GPG_CHECK=1
# don't do the gpg check in air-gapped regions
if [ -n "$AIR_GAPPED" ]; then
GPG_CHECK=0
fi
tmpfile=$(mktemp)
cat >$tmpfile <<EOF
if [[ $AMI_TYPE == "al2023"*"gpu" ]]; then
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-yum-or-dnf
tmpfile=$(mktemp)
cat >$tmpfile <<"EOF"
[nvidia-container-toolkit]
name=nvidia-container-toolkit
baseurl=https://nvidia.github.io/libnvidia-container/stable/rpm/$basearch
repo_gpgcheck=1
gpgcheck=0
enabled=1
gpgkey=https://nvidia.github.io/libnvidia-container/gpgkey
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF
sudo mv $tmpfile "/etc/yum.repos.d/nvidia-container-toolkit.repo"

# https://github.com/aws/amazon-ecs-ami/issues/319#issuecomment-2471834667
sudo dnf install -y nvidia-release
sudo dnf clean all
else
# set up amzn2-nvidia repo
GPG_CHECK=1
# don't do the gpg check in air-gapped regions
if [ -n "$AIR_GAPPED" ]; then
GPG_CHECK=0
fi
tmpfile=$(mktemp)
cat >$tmpfile <<EOF
[amzn2-nvidia]
name=Amazon Linux 2 Nvidia repository
mirrorlist=\$awsproto://\$amazonlinux.\$awsregion.\$awsdomain/\$releasever/amzn2-nvidia/latest/\$basearch/mirror.list
Expand All @@ -33,14 +53,15 @@ enabled=1
exclude=libglvnd-*
EOF

DKMS=/usr/sbin/dkms
DKMS_ARCHIVE_DIR=/var/lib/dkms-archive
DKMS=/usr/sbin/dkms
DKMS_ARCHIVE_DIR=/var/lib/dkms-archive

# the amzn2-nvidia repo is temporary and only used for installing the system-release-nvidia package
sudo mv $tmpfile /etc/yum.repos.d/amzn2-nvidia-tmp.repo
# the amzn2-nvidia repo is temporary and only used for installing the system-release-nvidia package
sudo mv $tmpfile /etc/yum.repos.d/amzn2-nvidia-tmp.repo
fi

# only install open driver for post-kepler gpus, exclude airgapped regions
if [[ $AMI_TYPE != "al2keplergpu" && -z ${AIR_GAPPED} ]]; then
if [[ $AMI_TYPE != "al2keplergpu" && $AMI_TYPE != "al2023"*"gpu" && -z ${AIR_GAPPED} ]]; then
sudo yum install -y yum-plugin-versionlock yum-utils
sudo amazon-linux-extras install epel -y
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
Expand Down Expand Up @@ -108,9 +129,11 @@ EOF
sudo chmod +x /var/lib/ecs/scripts/install-nvidia-open-kmod.sh
fi

# system-release-nvidia creates an nvidia repo file at /etc/yum.repos.d/amzn2-nvidia.repo
sudo yum install -y system-release-nvidia
sudo rm /etc/yum.repos.d/amzn2-nvidia-tmp.repo
# AL2-only step. "al2023gpu" also matches the "al2"*"gpu" glob, so AL2023 has to
# be excluded explicitly with the second test.
if [[ $AMI_TYPE == "al2"*"gpu" && $AMI_TYPE != "al2023"*"gpu" ]]; then
# system-release-nvidia creates an nvidia repo file at /etc/yum.repos.d/amzn2-nvidia.repo
sudo yum install -y system-release-nvidia
# the temporary repo file is no longer needed once the package above is installed
sudo rm /etc/yum.repos.d/amzn2-nvidia-tmp.repo
fi

# for building AMIs for GPUs with Kepler architecture, fix package versions
# also exclude nvidia and cuda packages to update. Newer Nvidia drivers do not support Kepler architecture
Expand All @@ -130,6 +153,22 @@ if [[ $AMI_TYPE == "al2keplergpu" ]]; then

sudo yum install -y cuda-toolkit-11-4
echo "exclude=*nvidia* *cuda*" | sudo tee -a /etc/yum.conf
elif [[ $AMI_TYPE == "al2023"*"gpu" ]]; then
kernel_release=$(uname -r)

sudo dnf install -y kernel-devel-"$kernel_release" \
kernel-modules-extra-"$kernel_release" \
nvidia-driver \
nvidia-fabric-manager \
libnvidia-container1 \
libnvidia-container-tools \
nvidia-container-toolkit

# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
sudo nvidia-ctk runtime configure --runtime=docker
giantcow marked this conversation as resolved.
Show resolved Hide resolved
sudo nvidia-ctk runtime configure --runtime=containerd
sudo systemctl restart docker
sudo systemctl restart containerd
else
# Default GPU AMI
sudo yum install -y kernel-devel-$(uname -r) \
Expand Down