Skip to content

Commit

Permalink
Add nvidia-open kernel module installation steps
Browse files Browse the repository at this point in the history
  • Loading branch information
fierlion committed Nov 10, 2023
1 parent 598499e commit 22a4ae9
Showing 1 changed file with 46 additions and 1 deletion.
47 changes: 46 additions & 1 deletion scripts/enable-ecs-agent-gpu-support.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,42 @@
#!/usr/bin/env bash
# -e: stop at the first command that exits non-zero.
# -x: print each command (after expansion) before it runs.
set -ex

if [[ $AMI_TYPE != "al2gpu" && $AMI_TYPE != "al2keplergpu" ]]; then
# GPU setup only applies to the GPU AMI variants (al2gpu, al2keplergpu,
# opengpu); for any other AMI_TYPE the script ends here with success.
# Every comparison must expand $AMI_TYPE: a bare AMI_TYPE is the literal
# string "AMI_TYPE", which never equals "opengpu", so that clause would
# always be true and opengpu builds would exit here too.
if [[ "$AMI_TYPE" != "al2gpu" && "$AMI_TYPE" != "al2keplergpu" && "$AMI_TYPE" != "opengpu" ]]; then
  exit 0
fi

if [[ "$AMI_TYPE" == "opengpu" ]]; then
  # Build the open NVIDIA kernel module with DKMS and keep a tarball of the
  # result so it can be loaded back later in this script.

  # yum helper packages plus the EPEL extras topic
  sudo yum install -y yum-plugin-versionlock yum-utils
  sudo amazon-linux-extras install epel -y

  # switch from the amzn2-nvidia repo to NVIDIA's rhel7 CUDA repo
  sudo yum-config-manager --disable amzn2-nvidia
  sudo yum-config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
  sudo yum-config-manager --enable cuda-rhel7.repo

  # open-source DKMS module and its common package, from the rhel7 repo
  sudo yum install -y kmod-nvidia-open-dkms nvidia-kmod-common

  # DKMS and DKMS_ARCHIVE_DIR are reused by the later opengpu section.
  DKMS=/usr/sbin/dkms
  DKMS_ARCHIVE_DIR=/var/lib/dkms-archive
  MODULE_NAME="nvidia-open"
  # version = second field of `dkms status`, with any ',' or ':' stripped
  MODULE_VERSION=$("${DKMS}" status -m "${MODULE_NAME}" | awk '{print $2}' | tr -d ',:')

  # build the module, pack it into a tarball, and copy the tarball to the
  # archive directory
  sudo "${DKMS}" build -m "${MODULE_NAME}" -v "${MODULE_VERSION}"
  sudo "${DKMS}" mktarball -m "${MODULE_NAME}" -v "${MODULE_VERSION}"
  sudo mkdir -p "${DKMS_ARCHIVE_DIR}/${MODULE_NAME}/"
  sudo cp "/var/lib/dkms/${MODULE_NAME}/${MODULE_VERSION}/tarball/"*.tar.gz "${DKMS_ARCHIVE_DIR}/${MODULE_NAME}/"

  # undo the repo switch: drop the rhel7 repo and the yum cache, then turn
  # amzn2-nvidia back on
  sudo yum-config-manager --disable cuda-rhel7.repo
  sudo rm /etc/yum.repos.d/cuda-rhel7.repo
  sudo rm -rf /var/cache/yum
  sudo yum-config-manager --enable amzn2-nvidia
fi

GPG_CHECK=1
# don't do the gpg check in air-gapped regions
if [ -n "$AIR_GAPPED" ]; then
Expand Down Expand Up @@ -63,6 +95,19 @@ else
sudo yum install -y cuda-drivers \
cuda
fi

if [[ "$AMI_TYPE" == "opengpu" ]]; then
  # Replace the closed-source "nvidia" DKMS module with the archived
  # "nvidia-open" build. DKMS and DKMS_ARCHIVE_DIR are set by the earlier
  # opengpu section of this script.

  # look up the closed-source module's version and remove it from DKMS
  MODULE_NAME="nvidia"
  MODULE_VERSION=$("${DKMS}" status -m "${MODULE_NAME}" | awk '{print $2}' | tr -d ',:')
  sudo "${DKMS}" remove -m "${MODULE_NAME}" -v "${MODULE_VERSION}" --all

  # hand the archived open-module tarball to dkms ldtarball
  MODULE_NAME="nvidia-open"
  # The glob is stored as plain text here; it is expanded on the next command
  # because ${MODULE_ARCHIVE} is deliberately left unquoted there.
  MODULE_ARCHIVE="${DKMS_ARCHIVE_DIR}/${MODULE_NAME}/*.tar.gz"
  # shellcheck disable=SC2086 # unquoted on purpose so the pattern globs
  sudo "${DKMS}" ldtarball ${MODULE_ARCHIVE}
fi

# The Fabric Manager service needs to be started and enabled on EC2 P4d instances
# in order to configure NVLinks and NVSwitches
sudo systemctl enable nvidia-fabricmanager
Expand Down

0 comments on commit 22a4ae9

Please sign in to comment.