From f0eee0dae16139e2d6c83ef838b6f6c42efb1be3 Mon Sep 17 00:00:00 2001 From: Lachlan Evenson Date: Fri, 17 Aug 2018 10:59:35 -0700 Subject: [PATCH] Add dkms to manage nvidia gpu kmod compilation across different linux kernels (#3688) --- parts/k8s/kubernetesagentcustomdata.yml | 17 +++++++++++++++++ pkg/acsengine/engine.go | 5 +++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/parts/k8s/kubernetesagentcustomdata.yml b/parts/k8s/kubernetesagentcustomdata.yml index 7738883088..9db9059a79 100644 --- a/parts/k8s/kubernetesagentcustomdata.yml +++ b/parts/k8s/kubernetesagentcustomdata.yml @@ -80,6 +80,23 @@ write_files: WantedBy=multi-user.target {{end}} +{{if IsNSeriesSKU .}} +- path: "/etc/systemd/system/nvidia-modprobe.service" + permissions: "0644" + owner: "root" + content: | + [Unit] + Description=Installs and loads Nvidia GPU kernel module + [Service] + Type=oneshot + RemainAfterExit=true + ExecStartPre=/bin/sh -c "dkms autoinstall --verbose" + ExecStart=/bin/sh -c "nvidia-modprobe -u -c0" + ExecStartPost=/bin/sh -c "sleep 10 && systemctl restart kubelet" + [Install] + WantedBy=multi-user.target +{{end}} + - path: "/etc/kubernetes/certs/ca.crt" permissions: "0644" encoding: "base64" diff --git a/pkg/acsengine/engine.go b/pkg/acsengine/engine.go index 404a8b2116..7391bd0ba7 100644 --- a/pkg/acsengine/engine.go +++ b/pkg/acsengine/engine.go @@ -484,7 +484,7 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { - retrycmd_if_failure_no_stats 180 1 5 curl -fsSL https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list > /tmp/nvidia-docker.list - cat /tmp/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - apt_get_update -- retrycmd_if_failure 5 5 300 apt-get install -y linux-headers-$(uname -r) gcc make +- retrycmd_if_failure 5 5 300 apt-get install -y linux-headers-$(uname -r) gcc make dkms - retrycmd_if_failure 5 5 300 apt-get -o Dpkg::Options::="--force-confold" install -y 
nvidia-docker2=%s+docker%s nvidia-container-runtime=%s+docker%s - sudo pkill -SIGHUP dockerd - mkdir -p %s @@ -505,13 +505,14 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { Run nvidia-smi to test the installation, unmount overlayfs and restard kubelet (GPUs are only discovered when kubelet starts) */ installScript += fmt.Sprintf(` -- sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s" +- sh nvidia-drivers-%s --silent --accept-license --no-drm --dkms --utility-prefix="%s" --opengl-prefix="%s" - echo "%s" > /etc/ld.so.conf.d/nvidia.conf - sudo ldconfig - umount -l /usr/lib/x86_64-linux-gnu - nvidia-modprobe -u -c0 - %s/bin/nvidia-smi - sudo ldconfig +- systemctl enable nvidia-modprobe - retrycmd_if_failure 5 10 60 systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest) /* If a new GPU sku becomes available, add a key to this map, but only provide an installation script if you have a confirmation