Skip to content

Commit

Permalink
Support AL2 kernel 5.10 GPU and INF
Browse files Browse the repository at this point in the history
  • Loading branch information
danehlim committed Feb 28, 2024
1 parent ba5782d commit 3c0411b
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 11 deletions.
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ al2kernel5dot10: check-region init validate release-al2.auto.pkrvars.hcl
al2kernel5dot10arm: check-region init validate release-al2.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2kernel5dot10arm" -var "region=${REGION}" .

# Build the AL2 kernel 5.10 GPU AMI: after the check-region, init, validate and
# release-al2.auto.pkrvars.hcl prerequisites, run packer restricted to the
# "amazon-ebs.al2kernel5dot10gpu" source in the region given by ${REGION}.
.PHONY: al2kernel5dot10gpu
al2kernel5dot10gpu: check-region init validate release-al2.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2kernel5dot10gpu" -var "region=${REGION}" .

# Build the AL2 kernel 5.10 Inferentia AMI: after the check-region, init, validate
# and release-al2.auto.pkrvars.hcl prerequisites, run packer restricted to the
# "amazon-ebs.al2kernel5dot10inf" source in the region given by ${REGION}.
.PHONY: al2kernel5dot10inf
al2kernel5dot10inf: check-region init validate release-al2.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2kernel5dot10inf" -var "region=${REGION}" .

.PHONY: al2023
al2023: check-region init validate release-al2023.auto.pkrvars.hcl
./packer build -only="amazon-ebs.al2023" -var "region=${REGION}" .
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ It will create a private AMI in whatever account you are running it in.

1. Setup AWS cli credentials.
2. Make the recipe that you want, REGION must be specified. Options are: al1, al2, al2arm, al2gpu, al2keplergpu, al2inf,
al2kernel5dot10, al2kernel5dot10arm, al2023, al2023arm, al2023neu.
al2kernel5dot10, al2kernel5dot10arm, al2kernel5dot10gpu, al2kernel5dot10inf, al2023, al2023arm, al2023neu.
```
REGION=us-west-2 make al2
```
Expand Down
17 changes: 14 additions & 3 deletions al2.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ build {
"source.amazon-ebs.al2keplergpu",
"source.amazon-ebs.al2inf",
"source.amazon-ebs.al2kernel5dot10",
"source.amazon-ebs.al2kernel5dot10arm"
"source.amazon-ebs.al2kernel5dot10arm",
"source.amazon-ebs.al2kernel5dot10gpu",
"source.amazon-ebs.al2kernel5dot10inf"
]

provisioner "file" {
Expand Down Expand Up @@ -174,12 +176,21 @@ build {

provisioner "shell" {
environment_vars = ["AMI_TYPE=${source.name}"]
script = "scripts/enable-ecs-agent-inferentia-support.sh"
script = "scripts/al2/install-kernel5dot10.sh"
}

### If necessary, reboot worker instance to install kernel update for enable-ecs-agent-inferentia-support or
### enable-ecs-agent-gpu-support scripts that factor in kernel version.
provisioner "shell" {
environment_vars = ["AMI_TYPE=${source.name}"]
expect_disconnect = "true"
script = "scripts/al2/reboot-for-kernel-upgrade.sh"
}

provisioner "shell" {
environment_vars = ["AMI_TYPE=${source.name}"]
script = "scripts/al2/install-kernel5dot10.sh"
pause_before = "10s" # pause for starting the reboot
script = "scripts/enable-ecs-agent-inferentia-support.sh"
}

provisioner "shell" {
Expand Down
33 changes: 33 additions & 0 deletions al2kernel5dot10gpu.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# AMI name for the AL2 kernel 5.10 GPU recipe, composed from the AL2 name prefix
# and the AL2 AMI version variables.
locals {
ami_name_al2kernel5dot10gpu = "${var.ami_name_prefix_al2}-kernel-5.10-gpu-hvm-2.0.${var.ami_version_al2}-x86_64-ebs"
}

# Packer amazon-ebs source for the "al2kernel5dot10gpu" recipe.
source "amazon-ebs" "al2kernel5dot10gpu" {
ami_name = "${local.ami_name_al2kernel5dot10gpu}"
# NOTE(review): the description does not mention GPU; confirm this is the intended
# wording for GPU recipes.
ami_description = "Amazon Linux AMI 2.0.${var.ami_version_al2} Kernel 5.10 x86_64 ECS HVM GP2"
# The build instance uses the first entry of var.gpu_instance_types.
instance_type = var.gpu_instance_types[0]
# Launch volume: gp2 on /dev/xvda, sized by var.block_device_size_gb and deleted
# on termination.
launch_block_device_mappings {
volume_size = var.block_device_size_gb
delete_on_termination = true
volume_type = "gp2"
device_name = "/dev/xvda"
}
region = var.region
# Base image: the most recent Amazon-owned AMI whose name matches
# var.source_ami_al2kernel5dot10.
source_ami_filter {
filters = {
name = "${var.source_ami_al2kernel5dot10}"
}
owners = ["amazon"]
most_recent = true
}
ssh_interface = "public_ip"
ssh_username = "ec2-user"
# Tags describing the OS, source image, Docker/ECS agent versions and recipe type.
tags = {
os_version = "Amazon Linux 2"
source_image_name = "{{ .SourceAMIName }}"
ecs_runtime_version = "Docker version ${var.docker_version}"
ecs_agent_version = "${var.ecs_agent_version}"
ami_type = "al2kernel5dot10gpu"
ami_version = "2.0.${var.ami_version_al2}"
}
}
34 changes: 34 additions & 0 deletions al2kernel5dot10inf.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# AMI name for the AL2 kernel 5.10 Inferentia recipe, composed from the AL2 name
# prefix and the AL2 AMI version variables.
locals {
ami_name_al2kernel5dot10inf = "${var.ami_name_prefix_al2}-kernel-5.10-inf-hvm-2.0.${var.ami_version_al2}-x86_64-ebs"
}

# Packer amazon-ebs source for the "al2kernel5dot10inf" recipe.
source "amazon-ebs" "al2kernel5dot10inf" {
ami_name = "${local.ami_name_al2kernel5dot10inf}"
# NOTE(review): the description does not mention Inferentia; confirm this is the
# intended wording for Inferentia recipes.
ami_description = "Amazon Linux AMI 2.0.${var.ami_version_al2} Kernel 5.10 x86_64 ECS HVM GP2"
# The build instance uses the first entry of var.inf_instance_types.
instance_type = var.inf_instance_types[0]
# Launch volume: gp2 on /dev/xvda, sized by var.block_device_size_gb and deleted
# on termination.
launch_block_device_mappings {
volume_size = var.block_device_size_gb
delete_on_termination = true
volume_type = "gp2"
device_name = "/dev/xvda"
}
region = var.region
# Base image: the most recent Amazon-owned AMI whose name matches
# var.source_ami_al2kernel5dot10.
source_ami_filter {
filters = {
name = "${var.source_ami_al2kernel5dot10}"
}
owners = ["amazon"]
most_recent = true
}
ssh_interface = "public_ip"
ssh_username = "ec2-user"
# Tags describing the OS, source image, Docker/ECS agent versions and recipe type.
tags = {
os_version = "Amazon Linux 2"
source_image_name = "{{ .SourceAMIName }}"
ecs_runtime_version = "Docker version ${var.docker_version}"
ecs_agent_version = "${var.ecs_agent_version}"
ami_type = "al2kernel5dot10inf"
ami_version = "2.0.${var.ami_version_al2}"
}
}

2 changes: 1 addition & 1 deletion scripts/al2/install-kernel5dot10.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# - Modify AL2 kernel 5.10 variables in generate-release-vars.sh to use SSM parameters of AL2 kernel 5.10 minimal AMIs
set -ex

if [[ $AMI_TYPE == "al2kernel5dot10" || $AMI_TYPE == "al2kernel5dot10arm" ]]; then
if [[ $AMI_TYPE == "al2kernel5dot10"* ]]; then
sudo amazon-linux-extras install -y kernel-5.10
sudo rpm -e kernel-4.*
fi
7 changes: 7 additions & 0 deletions scripts/al2/reboot-for-kernel-upgrade.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Reboots the build instance for the AL2 kernel 5.10 GPU and Inferentia recipes;
# every other AMI_TYPE passes through untouched.
#
# TO-DO: Disable/remove this script once Amazon Linux team has released AL2 kernel 5.10 minimal AMIs.
set -ex

case "$AMI_TYPE" in
  "al2kernel5dot10gpu" | "al2kernel5dot10inf")
    sudo reboot
    ;;
esac
27 changes: 23 additions & 4 deletions scripts/enable-ecs-agent-gpu-support.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
#!/usr/bin/env bash
set -ex

if [[ $AMI_TYPE != "al2gpu" && $AMI_TYPE != "al2keplergpu" ]]; then
#######################################
# Makes sure that a compatible version of gcc is used for compiling NVIDIA driver.
#
# Currently a compatible version of gcc is assumed to be used by default, unless
# the AMI recipe uses kernel 5.10; for every other AMI_TYPE this is a no-op.
#
# Globals:
#   AMI_TYPE        (read) recipe name; only values ending in "kernel5dot10gpu" are patched
#   MODULE_NAME     (read) DKMS module whose dkms.conf is edited; must be set
#   MODULE_VERSION  (read) version of that module; must be set
# Side effects:
#   Edits /usr/src/<MODULE_NAME>-<MODULE_VERSION>/dkms.conf in place through sudo.
# Returns:
#   0 when nothing had to be done or the edit succeeded; otherwise sed's status.
#######################################
set_compatible_gcc_version_for_nvidia_compile() {
  if [[ $AMI_TYPE != *"kernel5dot10gpu" ]]; then
    return 0
  fi

  # Explicitly use gcc10 since gcc version for compiling the NVIDIA driver must
  # match gcc version with which the Linux kernel was compiled.
  local -r gcc_bin='/usr/bin/gcc10-cc'

  # Fail with a clear message instead of silently targeting /usr/src/-/dkms.conf.
  local dkms_conf
  dkms_conf="/usr/src/${MODULE_NAME:?MODULE_NAME must be set}-${MODULE_VERSION:?MODULE_VERSION must be set}/dkms.conf"

  # The leading address skips lines that already carry the CC override, so
  # running this twice against the same dkms.conf does not append CC= again.
  # The path is quoted so an unexpected space/newline in the version cannot
  # split it into several sed arguments.
  sudo sed -i "\\|CC=${gcc_bin}|!s|'make' -j2 module|& CC=${gcc_bin}|" "${dkms_conf}"
}

if [[ $AMI_TYPE != "al2"*"gpu" ]]; then
exit 0
fi

Expand All @@ -23,6 +33,9 @@ enabled=1
exclude=libglvnd-*
EOF

DKMS=/usr/sbin/dkms
DKMS_ARCHIVE_DIR=/var/lib/dkms-archive

# the amzn2-nvidia repo is temporary and only used for installing the system-release-nvidia package
sudo mv $tmpfile /etc/yum.repos.d/amzn2-nvidia-tmp.repo

Expand All @@ -45,11 +58,9 @@ if [[ $AMI_TYPE != "al2keplergpu" && -z ${AIR_GAPPED} ]]; then
sudo yum install -y nvidia-kmod-common-${NVIDIA_VERSION}

# build nvidia-open kmod tar
DKMS=/usr/sbin/dkms
DKMS_ARCHIVE_DIR=/var/lib/dkms-archive
MODULE_NAME="nvidia-open"
MODULE_VERSION=$(${DKMS} status -m ${MODULE_NAME} | awk '{print $2}' | tr -d ',:')

set_compatible_gcc_version_for_nvidia_compile
sudo ${DKMS} build -m "${MODULE_NAME}" -v "${MODULE_VERSION}"
sudo ${DKMS} mktarball -m "${MODULE_NAME}" -v "${MODULE_VERSION}"
sudo mkdir -p "${DKMS_ARCHIVE_DIR}/${MODULE_NAME}/"
Expand Down Expand Up @@ -137,6 +148,14 @@ else
cuda
fi

# Kernel 5.10 GPU recipes only: patch the "nvidia" DKMS module's build settings
# to the compatible gcc version (gcc10) and reinstall the module so the drivers
# are rebuilt with it.
case "$AMI_TYPE" in
  *"kernel5dot10gpu")
    MODULE_NAME="nvidia"
    # Second column of `dkms status`, with any ',' and ':' characters removed.
    MODULE_VERSION=$(${DKMS} status -m ${MODULE_NAME} | awk '{print $2}' | tr -d ',:')
    set_compatible_gcc_version_for_nvidia_compile
    sudo ${DKMS} install -m "${MODULE_NAME}" -v "${MODULE_VERSION}"
    ;;
esac

# The Fabric Manager service needs to be started and enabled on EC2 P4d instances
# in order to configure NVLinks and NVSwitches
sudo systemctl enable nvidia-fabricmanager
Expand Down
4 changes: 2 additions & 2 deletions scripts/enable-ecs-agent-inferentia-support.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
set -ex

if [[ $AMI_TYPE != "al2inf" && $AMI_TYPE != "al2023neu" ]]; then
if [[ $AMI_TYPE != "al2"*"inf" && $AMI_TYPE != "al2023neu" ]]; then
exit 0
fi

Expand Down Expand Up @@ -32,7 +32,7 @@ sudo yum install -y aws-neuronx-oci-hook-2.*

# Install oci-add-hooks
# TODO: oci-add-hooks package has compatibility issue with AL2023 IMDSv2. Remove condition after root caused and resolved
if [[ $AMI_TYPE == "al2inf" ]]; then
if [[ $AMI_TYPE == "al2"*"inf" ]]; then
sudo yum install -y oci-add-hooks
fi

Expand Down

0 comments on commit 3c0411b

Please sign in to comment.