Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introducing AL2023 GPU AMIs #370

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ al2023arm: check-region init validate release-al2023.auto.pkrvars.hcl
al2023neu: check-region init validate release-al2023.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2023neu" -var "region=${REGION}" .

# Build only the AL2023 GPU AMI (Packer source "amazon-ebs.al2023gpu") in the
# region given by REGION. Prerequisites mirror the other al2023* targets.
.PHONY: al2023gpu
al2023gpu: check-region init validate release-al2023.auto.pkrvars.hcl
	./packer build -only="amazon-ebs.al2023gpu" -var "region=${REGION}" .

shellcheck:
curl -fLSs ${SHELLCHECK_URL} -o /tmp/shellcheck.tar.xz
tar -xvf /tmp/shellcheck.tar.xz -C /tmp --strip-components=1
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ It will create a private AMI in whatever account you are running it in.

1. Setup AWS cli credentials.
2. Make the recipe that you want, REGION must be specified. Options are: al1, al2, al2arm, al2gpu, al2keplergpu, al2inf,
al2kernel5dot10, al2kernel5dot10arm, al2kernel5dot10gpu, al2kernel5dot10inf, al2023, al2023arm, al2023neu.
al2kernel5dot10, al2kernel5dot10arm, al2kernel5dot10gpu, al2kernel5dot10inf, al2023, al2023arm, al2023neu, al2023gpu.
```
REGION=us-west-2 make al2
```
Expand Down
10 changes: 9 additions & 1 deletion al2023.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ build {
sources = [
"source.amazon-ebs.al2023",
"source.amazon-ebs.al2023arm",
"source.amazon-ebs.al2023neu"
"source.amazon-ebs.al2023neu",
"source.amazon-ebs.al2023gpu"
]

provisioner "file" {
Expand Down Expand Up @@ -173,6 +174,13 @@ build {
script = "scripts/enable-ecs-agent-inferentia-support.sh"
}

# GPU enablement step. This provisioner is not restricted to one source, so it
# runs for each source of the build; the script itself checks AMI_TYPE and
# exits early unless it matches al2023*gpu.
provisioner "shell" {
  script           = "scripts/enable-ecs-agent-gpu-support-al2023.sh"
  environment_vars = ["AMI_TYPE=${source.name}"]
}

provisioner "shell" {
inline_shebang = "/bin/sh -ex"
inline = [
Expand Down
40 changes: 40 additions & 0 deletions al2023gpu.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Naming and tagging values for the AL2023 GPU AMI.
locals {
  # AMI name layout:
  #   <prefix>-gpu-hvm-2023.0.<ami version><kernel version>-x86_64-ebs
  ami_name_al2023gpu = "${var.ami_name_prefix_al2023}-gpu-hvm-2023.0.${var.ami_version_al2023}${var.kernel_version_al2023}-x86_64-ebs"

  # Baseline tags describing what the image contains.
  default_tags = {
    ami_type            = "al2023gpu"
    ami_version         = "2023.0.${var.ami_version_al2023}"
    ecs_agent_version   = var.ecs_agent_version
    ecs_runtime_version = "Docker version ${var.docker_version_al2023}"
    os_version          = "Amazon Linux 2023"

    # Template placeholder kept verbatim; presumably resolved by Packer's
    # amazon builder at build time -- confirm against the builder docs.
    source_image_name = "{{ .SourceAMIName }}"
  }

  # var.tags is the later merge() argument, so its entries take precedence
  # over default_tags when a key appears in both.
  merged_tags = merge(local.default_tags, var.tags)
}

# Packer builder for the AL2023 GPU AMI. The build instance uses the first
# entry of var.gpu_instance_types.
source "amazon-ebs" "al2023gpu" {
  # Where and on what the build instance runs.
  region        = var.region
  instance_type = var.gpu_instance_types[0]

  # Base image: newest Amazon-owned AMI whose name matches
  # var.source_ami_al2023; deprecated images are also eligible.
  source_ami_filter {
    owners             = ["amazon"]
    most_recent        = true
    include_deprecated = true
    filters = {
      name = var.source_ami_al2023
    }
  }

  # Root volume of the build instance.
  launch_block_device_mappings {
    device_name           = "/dev/xvda"
    volume_type           = "gp3"
    volume_size           = var.block_device_size_gb
    delete_on_termination = true
  }

  # Identity of the resulting AMI.
  ami_name        = local.ami_name_al2023gpu
  ami_description = "Amazon Linux AMI 2023.0.${var.ami_version_al2023} x86_64 ECS HVM EBS"
  tags            = local.merged_tags

  # Sharing of the resulting AMI.
  ami_users    = var.ami_users
  ami_org_arns = var.ami_org_arns
  ami_ou_arns  = var.ami_ou_arns

  # Connection to the build instance, and tags on the instance itself.
  ssh_username  = "ec2-user"
  ssh_interface = "public_ip"
  run_tags      = var.run_tags
}
36 changes: 36 additions & 0 deletions scripts/enable-ecs-agent-gpu-support-al2023.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Installs the NVIDIA driver stack on AL2023 GPU AMIs and turns on GPU support
# in the ECS agent configuration.
#
# Input:
#   AMI_TYPE  Name of the AMI variant being built. The script does nothing
#             unless it matches the pattern al2023*gpu.
set -ex

# No-op for every AMI variant other than the AL2023 GPU ones.
if [[ $AMI_TYPE != "al2023"*"gpu" ]]; then
    exit 0
fi

# AL2023 GPU setup
sudo dnf install -y dkms kernel-modules-extra
sudo systemctl enable --now dkms

# nvidia-release creates an nvidia repo file at /etc/yum.repos.d/amazonlinux-nvidia.repo
# It is installed on its own, before the packages below, so that the repo is
# already configured when they are resolved.
sudo dnf install -y nvidia-release

# docker-runtime-nvidia is a custom Amazon Linux package that sets the gpu container runtime to nvidia
# instead of having to use 'nvidia-ctk runtime configure --runtime=docker'
sudo dnf install -y nvidia-driver \
    nvidia-fabric-manager \
    pciutils \
    xorg-x11-server-Xorg \
    nvidia-container-toolkit \
    docker-runtime-nvidia

sudo dnf install -y cuda

# The Fabric Manager service is needed on EC2 P4d instances in order to
# configure NVLinks and NVSwitches. It is only enabled here (no --now), so it
# is not started on the build instance.
sudo systemctl enable nvidia-fabricmanager

# The NVIDIA Persistence Daemon is needed on P5 instances to maintain
# persistent software state in the NVIDIA driver. As above, it is only enabled
# here, not started.
sudo systemctl enable nvidia-persistenced

# Enable GPU support for ECS
# The file is staged under /tmp and then moved into place with sudo; the move
# replaces any existing /var/lib/ecs/ecs.config.
mkdir -p /tmp/ecs
echo 'ECS_ENABLE_GPU_SUPPORT=true' >/tmp/ecs/ecs.config
sudo mv /tmp/ecs/ecs.config /var/lib/ecs/ecs.config
Loading