From 3aabc125456ea795499078dba797730010ae7d6c Mon Sep 17 00:00:00 2001 From: Charlie Drage Date: Thu, 14 Nov 2024 12:39:15 -0500 Subject: [PATCH] remove bootc-nvidia-base-centos container --- README.md | 33 ------------ bootc-nvidia-base-centos/Containerfile | 54 ------------------- bootc-nvidia-base-centos/README.md | 29 ---------- .../lib/systemd/system/nvidia-drivers.service | 24 --------- .../system/nvidia-toolkit-firstboot.service | 18 ------- 5 files changed, 158 deletions(-) delete mode 100644 bootc-nvidia-base-centos/Containerfile delete mode 100644 bootc-nvidia-base-centos/README.md delete mode 100644 bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-drivers.service delete mode 100644 bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-toolkit-firstboot.service diff --git a/README.md b/README.md index 96643de..5d735fb 100755 --- a/README.md +++ b/README.md @@ -34,7 +34,6 @@ Below is a general overview (with instructions) on each Docker container I use. - [bootc-k3s-master-amd64](#bootc-k3s-master-amd64) - [bootc-k3s-node-amd64](#bootc-k3s-node-amd64) - [bootc-microshift-centos](#bootc-microshift-centos) -- [bootc-nvidia-base-centos](#bootc-nvidia-base-centos) - [bootc-nvidia-base-fedora](#bootc-nvidia-base-fedora) - [cat](#cat) - [gameserver](#gameserver) @@ -209,38 +208,6 @@ Below is a general overview (with instructions) on each Docker container I use. RUN echo -e ' OpenShift 4.17 release\n\ Dependencies\n\ -## [bootc-nvidia-base-centos](/bootc-nvidia-base-centos/Containerfile) - - **Description:** - > IMPORTANT NOTE: This is BOOTC. This is meant for bootable container applications. See: https://github.com/containers/podman-desktop-extension-bootc - - This is a "base" container that installs the nvidia drivers and the nvidia container toolkit. - This is meant to be used as a base for other containers that need GPU access. - - DISABLE SECURE BOOT! You have been warned! Disable boot is **KNOWN** to cause issues with the nvidia drivers. 
- ENABLE 4G DECODING in the BIOS. This is needed for certain nvidia cards to work such as the Tesla P40. - - This uses Centos Stream 9 as the base image to (hopefully) be as stable as possible. Tried with Fedora 40 but found that the kernel was moving too fast - for the nvidia drivers to keep up / work properly / update correctly. - - IMPORTANT NOTE: - On boot, this will **not** have the nvidia drivers loaded it they are compiled. This is because akmods are suppose to be built on boot, but this doesn't work with bootc. - Instead, the nvidia drivers will recompile + use akmod + modprobe on boot.. and may take a minute to load. - If you have any systemd services that require the nvidia drivers, you will need to add a `After=nvidia-drivers.service` to the service or have it LATE in the boot order (ex. multi-user.target) - to ensure that the nvidia drivers are loaded before the service starts. - - For example, if you have a podman container with --restart=always, you will need to add a `After=nvidia-drivers.service` to the podman-restart.service and podman-restart.timer. file. - This has been done for you already within the nvidia-drivers.service and nvidia-toolkit-firstboot.service files. - - Note about nvidia-toolkit-fristboot.service file: This is a one-time service on boot that will create the /etc/cdi/nvidia.yaml file. This is necessary for podman - to use gpu devices. - - - **Running:** - 1. In your OTHER Containerfile, change to `FROM git.k8s.land/cdrage/bootc-nvidia-base-centos` / this Containerfile. - 2. The nvidia drivers will recompile + use akmod + modprobe on boot. - 3. Use nvidia-smi command within the booted container image to see if it works. 
- ## [bootc-nvidia-base-fedora](/bootc-nvidia-base-fedora/Containerfile) **Description:** diff --git a/bootc-nvidia-base-centos/Containerfile b/bootc-nvidia-base-centos/Containerfile deleted file mode 100644 index ee5cf59..0000000 --- a/bootc-nvidia-base-centos/Containerfile +++ /dev/null @@ -1,54 +0,0 @@ -# **Description:** -# > IMPORTANT NOTE: This is BOOTC. This is meant for bootable container applications. See: https://github.com/containers/podman-desktop-extension-bootc -# -# This is a "base" container that installs the nvidia drivers and the nvidia container toolkit. -# This is meant to be used as a base for other containers that need GPU access. -# -# DISABLE SECURE BOOT! You have been warned! Disable boot is **KNOWN** to cause issues with the nvidia drivers. -# ENABLE 4G DECODING in the BIOS. This is needed for certain nvidia cards to work such as the Tesla P40. -# -# This uses Centos Stream 9 as the base image to (hopefully) be as stable as possible. Tried with Fedora 40 but found that the kernel was moving too fast -# for the nvidia drivers to keep up / work properly / update correctly. -# -# IMPORTANT NOTE: -# On boot, this will **not** have the nvidia drivers loaded it they are compiled. This is because akmods are suppose to be built on boot, but this doesn't work with bootc. -# Instead, the nvidia drivers will recompile + use akmod + modprobe on boot.. and may take a minute to load. -# If you have any systemd services that require the nvidia drivers, you will need to add a `After=nvidia-drivers.service` to the service or have it LATE in the boot order (ex. multi-user.target) -# to ensure that the nvidia drivers are loaded before the service starts. -# -# For example, if you have a podman container with --restart=always, you will need to add a `After=nvidia-drivers.service` to the podman-restart.service and podman-restart.timer. file. -# This has been done for you already within the nvidia-drivers.service and nvidia-toolkit-firstboot.service files. 
-# -# Note about nvidia-toolkit-fristboot.service file: This is a one-time service on boot that will create the /etc/cdi/nvidia.yaml file. This is necessary for podman -# to use gpu devices. -# -# -# **Running:** -# 1. In your OTHER Containerfile, change to `FROM git.k8s.land/cdrage/bootc-nvidia-base-centos` / this Containerfile. -# 2. The nvidia drivers will recompile + use akmod + modprobe on boot. -# 3. Use nvidia-smi command within the booted container image to see if it works. -FROM quay.io/centos-bootc/centos-bootc:stream9 - -#! Install EPEL as well as free/non-free rpm fusion repo's for access to the nvidia drivers -RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ - https://mirrors.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm && \ - https://mirrors.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm - -#! Install the kernel devel and kernel header tools -RUN dnf install -y kernel-devel kernel-headers - -#! Install the nvidia drivers -RUN dnf install -y akmod-nvidia xorg-x11-drv-nvidia-cuda - -#! Install NVIDIA container toolkit -RUN curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo && \ - dnf install -y nvidia-container-toolkit - -#! Blacklist the nouveau driver to ensure NVIDIA drivers function properly -RUN echo "blacklist nouveau" > /etc/modprobe.d/blacklist_nouveau.conf - -#! Copy necessary usr files -COPY usr/ /usr/ - -#! Enable necessary services to be started at boot -RUN systemctl enable nvidia-toolkit-firstboot.service nvidia-drivers.service \ No newline at end of file diff --git a/bootc-nvidia-base-centos/README.md b/bootc-nvidia-base-centos/README.md deleted file mode 100644 index 1f53bfb..0000000 --- a/bootc-nvidia-base-centos/README.md +++ /dev/null @@ -1,29 +0,0 @@ - **Description:** - > IMPORTANT NOTE: This is BOOTC. 
This is meant for bootable container applications. See: https://github.com/containers/podman-desktop-extension-bootc - - This is a "base" container that installs the nvidia drivers and the nvidia container toolkit. - This is meant to be used as a base for other containers that need GPU access. - - DISABLE SECURE BOOT! You have been warned! Disable boot is **KNOWN** to cause issues with the nvidia drivers. - ENABLE 4G DECODING in the BIOS. This is needed for certain nvidia cards to work such as the Tesla P40. - - This uses Centos Stream 9 as the base image to (hopefully) be as stable as possible. Tried with Fedora 40 but found that the kernel was moving too fast - for the nvidia drivers to keep up / work properly / update correctly. - - IMPORTANT NOTE: - On boot, this will **not** have the nvidia drivers loaded it they are compiled. This is because akmods are suppose to be built on boot, but this doesn't work with bootc. - Instead, the nvidia drivers will recompile + use akmod + modprobe on boot.. and may take a minute to load. - If you have any systemd services that require the nvidia drivers, you will need to add a `After=nvidia-drivers.service` to the service or have it LATE in the boot order (ex. multi-user.target) - to ensure that the nvidia drivers are loaded before the service starts. - - For example, if you have a podman container with --restart=always, you will need to add a `After=nvidia-drivers.service` to the podman-restart.service and podman-restart.timer. file. - This has been done for you already within the nvidia-drivers.service and nvidia-toolkit-firstboot.service files. - - Note about nvidia-toolkit-fristboot.service file: This is a one-time service on boot that will create the /etc/cdi/nvidia.yaml file. This is necessary for podman - to use gpu devices. - - - **Running:** - 1. In your OTHER Containerfile, change to `FROM git.k8s.land/cdrage/bootc-nvidia-base-centos` / this Containerfile. - 2. 
The nvidia drivers will recompile + use akmod + modprobe on boot. - 3. Use nvidia-smi command within the booted container image to see if it works. diff --git a/bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-drivers.service b/bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-drivers.service deleted file mode 100644 index 70427f5..0000000 --- a/bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-drivers.service +++ /dev/null @@ -1,24 +0,0 @@ -[Unit] -Description=Bootc User Overlay and NVIDIA Setup will generate the kernel module and load the nvidia driver - -# Done before k3s and toolkit-firstboot -Before=nvidia-toolkit-firstboot.service -# Must be done BEFORE the podman-restart.service or podman.service (if using API) in case we are using GPU for podman for testing nvidia-smi -Before=podman-restart.service podman.service - -# Ensure it runs before multi-user.target which would load -# services such as k3s, etc. -Before=multi-user.target - -# VERY VERY BAD way of implementing this as we have to do usroverlay just to get the nvidia driver to work -# but I do not know how to get the nvidia driver to work without usroverlay to build the kernel and load it. 
-[Service] -Type=oneshot -ExecStart=-/usr/bin/bootc usroverlay -ExecStart=/usr/sbin/akmods --force -ExecStart=/usr/sbin/modprobe nvidia -RemainAfterExit=true -TimeoutStartSec=300 - -[Install] -WantedBy=basic.target \ No newline at end of file diff --git a/bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-toolkit-firstboot.service b/bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-toolkit-firstboot.service deleted file mode 100644 index 8b112fd..0000000 --- a/bootc-nvidia-base-centos/usr/lib/systemd/system/nvidia-toolkit-firstboot.service +++ /dev/null @@ -1,18 +0,0 @@ -[Unit] -# For more information see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html -Description=Generate /etc/cdi/nvidia.yaml to be used by Podman -# Ensure we do this AFTER the nvidia-drivers.service -After=nvidia-drivers.service -# Must be done BEFORE the podman-restart.service or podman.service (if using API) -# since /etc/cdi/nvidia.yaml is used by podman to access GPU -Before=podman-restart.service podman.service - -[Service] -Type=oneshot -ExecStart=-/usr/bin/mkdir -p /etc/cdi -ExecStart=/bin/bash -c '/usr/bin/nvidia-ctk cdi generate > /etc/cdi/nvidia.yaml' -RemainAfterExit=yes -TimeoutStartSec=300 - -[Install] -WantedBy=basic.target