# Patch summary: adds two Amazon Linux 2 kernel 5.10 AMI recipes, al2kernel5dot10gpu and al2kernel5dot10inf.
#  - Makefile / README.md: new make targets and their mention in the list of recipe options.
#  - al2.pkr.hcl: registers both sources in the build; install-kernel5dot10.sh now runs first, then a reboot
#    provisioner, and enable-ecs-agent-inferentia-support.sh runs after the reboot.
#  - al2kernel5dot10gpu.pkr.hcl / al2kernel5dot10inf.pkr.hcl: new amazon-ebs source definitions.
#  - scripts/: AMI_TYPE checks become glob matches; the GPU script appends CC=/usr/bin/gcc10-cc to the DKMS make
#    command when AMI_TYPE ends in kernel5dot10gpu.
# NOTE(review): line breaks in this copy of the patch appear to have been collapsed into spaces; restore them
#   before applying.
# NOTE(review): the "al2"*"gpu" and "al2"*"inf" patterns match any AMI_TYPE that starts with "al2" and ends with
#   "gpu"/"inf" -- confirm that no al2023* type is meant to be excluded by these checks.
diff --git a/Makefile b/Makefile index 55bae4f..59a60ba 100644 --- a/Makefile +++ b/Makefile @@ -81,6 +81,14 @@ al2kernel5dot10: check-region init validate release-al2.auto.pkrvars.hcl al2kernel5dot10arm: check-region init validate release-al2.auto.pkrvars.hcl ./packer build -only="amazon-ebs.al2kernel5dot10arm" -var "region=${REGION}" . +.PHONY: al2kernel5dot10gpu +al2kernel5dot10gpu: check-region init validate release-al2.auto.pkrvars.hcl + ./packer build -only="amazon-ebs.al2kernel5dot10gpu" -var "region=${REGION}" . + +.PHONY: al2kernel5dot10inf +al2kernel5dot10inf: check-region init validate release-al2.auto.pkrvars.hcl + ./packer build -only="amazon-ebs.al2kernel5dot10inf" -var "region=${REGION}" . + .PHONY: al2023 al2023: check-region init validate release-al2023.auto.pkrvars.hcl ./packer build -only="amazon-ebs.al2023" -var "region=${REGION}" . diff --git a/README.md b/README.md index 2cbf2a1..d9b45ce 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ It will create a private AMI in whatever account you are running it in. 1. Setup AWS cli credentials. 2. Make the recipe that you want, REGION must be specified. Options are: al1, al2, al2arm, al2gpu, al2keplergpu, al2inf, -al2kernel5dot10, al2kernel5dot10arm, al2023, al2023arm, al2023neu. +al2kernel5dot10, al2kernel5dot10arm, al2kernel5dot10gpu, al2kernel5dot10inf, al2023, al2023arm, al2023neu. 
``` REGION=us-west-2 make al2 ``` diff --git a/al2.pkr.hcl b/al2.pkr.hcl index a276cb8..d9f1319 100644 --- a/al2.pkr.hcl +++ b/al2.pkr.hcl @@ -40,7 +40,9 @@ build { "source.amazon-ebs.al2keplergpu", "source.amazon-ebs.al2inf", "source.amazon-ebs.al2kernel5dot10", - "source.amazon-ebs.al2kernel5dot10arm" + "source.amazon-ebs.al2kernel5dot10arm", + "source.amazon-ebs.al2kernel5dot10gpu", + "source.amazon-ebs.al2kernel5dot10inf" ] provisioner "file" { @@ -174,12 +176,23 @@ build { provisioner "shell" { environment_vars = ["AMI_TYPE=${source.name}"] - script = "scripts/enable-ecs-agent-inferentia-support.sh" + script = "scripts/al2/install-kernel5dot10.sh" + } + + ### Reboot the worker instance to apply the kernel update. enable-ecs-agent-inferentia-support needs + ### the new kernel (if there is one) to be installed. + provisioner "shell" { + inline_shebang = "/bin/sh -ex" + expect_disconnect = "true" + inline = [ + "sudo reboot" + ] } provisioner "shell" { environment_vars = ["AMI_TYPE=${source.name}"] - script = "scripts/al2/install-kernel5dot10.sh" + pause_before = "10s" # give the reboot issued by the previous provisioner time to start + script = "scripts/enable-ecs-agent-inferentia-support.sh" } provisioner "shell" { diff --git a/al2kernel5dot10gpu.pkr.hcl b/al2kernel5dot10gpu.pkr.hcl new file mode 100644 index 0000000..f2850c2 --- /dev/null +++ b/al2kernel5dot10gpu.pkr.hcl @@ -0,0 +1,33 @@ +locals { + ami_name_al2kernel5dot10gpu = "${var.ami_name_prefix_al2}-kernel-5.10-gpu-hvm-2.0.${var.ami_version_al2}-x86_64-ebs" +} + +source "amazon-ebs" "al2kernel5dot10gpu" { + ami_name = "${local.ami_name_al2kernel5dot10gpu}" + ami_description = "Amazon Linux AMI 2.0.${var.ami_version_al2} Kernel 5.10 x86_64 ECS HVM GP2" + instance_type = var.gpu_instance_types[0] + launch_block_device_mappings { + volume_size = var.block_device_size_gb + delete_on_termination = true + volume_type = "gp2" + device_name = "/dev/xvda" + } + region = var.region + source_ami_filter { + filters = { + name = "${var.source_ami_al2kernel5dot10}" 
+ } + owners = ["amazon"] + most_recent = true + } + ssh_interface = "public_ip" + ssh_username = "ec2-user" + tags = { + os_version = "Amazon Linux 2" + source_image_name = "{{ .SourceAMIName }}" + ecs_runtime_version = "Docker version ${var.docker_version}" + ecs_agent_version = "${var.ecs_agent_version}" + ami_type = "al2kernel5dot10gpu" + ami_version = "2.0.${var.ami_version_al2}" + } +} diff --git a/al2kernel5dot10inf.pkr.hcl b/al2kernel5dot10inf.pkr.hcl new file mode 100644 index 0000000..82a2202 --- /dev/null +++ b/al2kernel5dot10inf.pkr.hcl @@ -0,0 +1,34 @@ +locals { + ami_name_al2kernel5dot10inf = "${var.ami_name_prefix_al2}-kernel-5.10-inf-hvm-2.0.${var.ami_version_al2}-x86_64-ebs" +} + +source "amazon-ebs" "al2kernel5dot10inf" { + ami_name = "${local.ami_name_al2kernel5dot10inf}" + ami_description = "Amazon Linux AMI 2.0.${var.ami_version_al2} Kernel 5.10 x86_64 ECS HVM GP2" + instance_type = var.inf_instance_types[0] + launch_block_device_mappings { + volume_size = var.block_device_size_gb + delete_on_termination = true + volume_type = "gp2" + device_name = "/dev/xvda" + } + region = var.region + source_ami_filter { + filters = { + name = "${var.source_ami_al2kernel5dot10}" + } + owners = ["amazon"] + most_recent = true + } + ssh_interface = "public_ip" + ssh_username = "ec2-user" + tags = { + os_version = "Amazon Linux 2" + source_image_name = "{{ .SourceAMIName }}" + ecs_runtime_version = "Docker version ${var.docker_version}" + ecs_agent_version = "${var.ecs_agent_version}" + ami_type = "al2kernel5dot10inf" + ami_version = "2.0.${var.ami_version_al2}" + } +} + diff --git a/scripts/al2/install-kernel5dot10.sh b/scripts/al2/install-kernel5dot10.sh index 4210ff7..2895925 100644 --- a/scripts/al2/install-kernel5dot10.sh +++ b/scripts/al2/install-kernel5dot10.sh @@ -4,7 +4,7 @@ # - Modify AL2 kernel 5.10 variables in generate-release-vars.sh to use SSM parameters of AL2 kernel 5.10 minimal AMIs set -ex -if [[ $AMI_TYPE == "al2kernel5dot10" || $AMI_TYPE == 
"al2kernel5dot10arm" ]]; then +if [[ $AMI_TYPE == "al2kernel5dot10"* ]]; then sudo amazon-linux-extras install -y kernel-5.10 sudo rpm -e kernel-4.* fi diff --git a/scripts/enable-ecs-agent-gpu-support.sh b/scripts/enable-ecs-agent-gpu-support.sh index b9c4251..116b1b5 100644 --- a/scripts/enable-ecs-agent-gpu-support.sh +++ b/scripts/enable-ecs-agent-gpu-support.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -ex -if [[ $AMI_TYPE != "al2gpu" && $AMI_TYPE != "al2keplergpu" ]]; then +if [[ $AMI_TYPE != "al2"*"gpu" ]]; then exit 0 fi @@ -23,6 +23,9 @@ enabled=1 exclude=libglvnd-* EOF +DKMS=/usr/sbin/dkms +DKMS_ARCHIVE_DIR=/var/lib/dkms-archive + # the amzn2-nvidia repo is temporary and only used for installing the system-release-nvidia package sudo mv $tmpfile /etc/yum.repos.d/amzn2-nvidia-tmp.repo @@ -45,11 +48,15 @@ if [[ $AMI_TYPE != "al2keplergpu" && -z ${AIR_GAPPED} ]]; then sudo yum install -y nvidia-kmod-common-${NVIDIA_VERSION} # build nvidia-open kmod tar - DKMS=/usr/sbin/dkms - DKMS_ARCHIVE_DIR=/var/lib/dkms-archive MODULE_NAME="nvidia-open" MODULE_VERSION=$(${DKMS} status -m ${MODULE_NAME} | awk '{print $2}' | tr -d ',:') + if [[ $AMI_TYPE == *"kernel5dot10gpu" ]]; then + # explicitly use gcc10 since the gcc version used to compile the NVIDIA driver must match the gcc version with which the + # Linux kernel was compiled + sudo sed -i "s/'make' -j2 module/& CC=\/usr\/bin\/gcc10-cc/" /usr/src/${MODULE_NAME}-${MODULE_VERSION}/dkms.conf + fi + sudo ${DKMS} build -m "${MODULE_NAME}" -v "${MODULE_VERSION}" sudo ${DKMS} mktarball -m "${MODULE_NAME}" -v "${MODULE_VERSION}" sudo mkdir -p "${DKMS_ARCHIVE_DIR}/${MODULE_NAME}/" @@ -137,6 +144,18 @@ else cuda fi +if [[ $AMI_TYPE == *"kernel5dot10gpu" ]]; then + MODULE_NAME="nvidia" + MODULE_VERSION=$(${DKMS} status -m ${MODULE_NAME} | awk '{print $2}' | tr -d ',:') + + # explicitly use gcc10 since the gcc version used to compile the NVIDIA driver must match the gcc version with which the + # Linux kernel was compiled + sudo sed -i "s/'make' 
-j2 module/& CC=\/usr\/bin\/gcc10-cc/" /usr/src/${MODULE_NAME}-${MODULE_VERSION}/dkms.conf + + # rebuild module/update drivers using gcc10 + sudo ${DKMS} install -m "${MODULE_NAME}" -v "${MODULE_VERSION}" +fi + # The Fabric Manager service needs to be started and enabled on EC2 P4d instances # in order to configure NVLinks and NVSwitches sudo systemctl enable nvidia-fabricmanager diff --git a/scripts/enable-ecs-agent-inferentia-support.sh b/scripts/enable-ecs-agent-inferentia-support.sh index 45f70e2..e959ea0 100644 --- a/scripts/enable-ecs-agent-inferentia-support.sh +++ b/scripts/enable-ecs-agent-inferentia-support.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -ex -if [[ $AMI_TYPE != "al2inf" && $AMI_TYPE != "al2023neu" ]]; then +if [[ $AMI_TYPE != "al2"*"inf" && $AMI_TYPE != "al2023neu" ]]; then exit 0 fi @@ -32,7 +32,7 @@ sudo yum install -y aws-neuronx-oci-hook-2.* # Install oci-add-hooks # TODO: oci-add-hooks package has compatibility issue with AL2023 IMDSv2. Remove condition after root caused and resolved -if [[ $AMI_TYPE == "al2inf" ]]; then +if [[ $AMI_TYPE == "al2"*"inf" ]]; then sudo yum install -y oci-add-hooks fi