Skip to content

Commit

Permalink
Add 'hostPaths.driverInstallDir' field to ClusterPolicy
Browse files Browse the repository at this point in the history
This allows for non-standard driver container installations, where
the driver installation path and device nodes are rooted at paths
other than '/run/nvidia/driver'.

Note, setting driverInstallDir to a custom value is currently
only supported for driver container installations not managed by
the GPU Operator. For example, in the GKE use case where a driver
daemonset is deployed prior to installing GPU Operator and the GPU
Operator managed driver is disabled.

The GPU Operator's driver container daemonset still assumes that
the full driver installation is made available at '/run/nvidia/driver'
on the host, and consequently, we always mount '/run/nvidia/driver'
into the GPU Operator managed daemonset. We may consider removing this
assumption in the future and support driver container implementations
which allow for a custom driverInstallDir to be specified.

Signed-off-by: Christopher Desiniotis <[email protected]>
  • Loading branch information
cdesiniotis committed Jun 17, 2024
1 parent 8e4021c commit d53e3e3
Show file tree
Hide file tree
Showing 12 changed files with 324 additions and 52 deletions.
4 changes: 4 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,10 @@ type HostPathsSpec struct {
// Examples include the MIG Manager and Toolkit Container which may need to
// stop, start, or restart systemd services.
RootFS string `json:"rootFS,omitempty"`

// DriverInstallDir represents the root at which driver files including libraries,
// config files, and executables can be found.
DriverInstallDir string `json:"driverInstallDir,omitempty"`
}

// EnvVar represents an environment variable present in a Container.
Expand Down
5 changes: 5 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,11 @@ spec:
description: HostPaths defines various paths on the host needed by
GPU Operator components
properties:
driverInstallDir:
description: |-
DriverInstallDir represents the root at which driver files including libraries,
config files, and executables can be found.
type: string
rootFS:
description: |-
RootFS represents the path to the root filesystem of the host.
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,11 @@ spec:
description: HostPaths defines various paths on the host needed by
GPU Operator components
properties:
driverInstallDir:
description: |-
DriverInstallDir represents the root at which driver files including libraries,
config files, and executables can be found.
type: string
rootFS:
description: |-
RootFS represents the path to the root filesystem of the host.
Expand Down
43 changes: 43 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ const (
DefaultMPSRoot = "/run/nvidia/mps"
// HostRootEnvName is the name of the envvar representing the root path of the underlying host
HostRootEnvName = "HOST_ROOT"
// DefaultDriverInstallDir represents the default path of a driver container installation
DefaultDriverInstallDir = "/run/nvidia/driver"
// DriverInstallDirEnvName is the name of the envvar used by the driver-validator to represent the driver install dir
DriverInstallDirEnvName = "DRIVER_INSTALL_DIR"
// DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path
// of the driver install dir mounted in the container
DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
)

// ContainerProbe defines container probe types
Expand Down Expand Up @@ -717,6 +724,9 @@ func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error
// transform the host-root and host-dev-char volumes if a custom host root is configured with the operator
transformForHostRoot(obj, n.singleton.Spec.HostPaths.RootFS)

// transform the driver-root volume if a custom driver install dir is configured with the operator
transformForDriverInstallDir(obj, n.singleton.Spec.HostPaths.DriverInstallDir)

// apply per operand Daemonset config
err = t(obj, &n.singleton.Spec, n)
if err != nil {
Expand Down Expand Up @@ -820,6 +830,39 @@ func transformHostDevCharVolume(obj *appsv1.DaemonSet, hostRoot string) {
}
}

// transformForDriverInstallDir applies the transforms needed when a custom
// driver install directory is configured.
//
// It is a no-op when driverInstallDir is empty or equal to
// DefaultDriverInstallDir, or when the daemonset has no hostPath volume named
// "driver-install-dir". Otherwise the volume's host path is set to
// driverInstallDir and, for every init container named "driver-validation",
// the DriverInstallDirEnvName and DriverInstallDirCtrPathEnvName envvars are
// set to driverInstallDir and the container's "driver-install-dir" volume
// mount is moved to that same path.
func transformForDriverInstallDir(obj *appsv1.DaemonSet, driverInstallDir string) {
	if driverInstallDir == "" || driverInstallDir == DefaultDriverInstallDir {
		return
	}

	const volumeName = "driver-install-dir"

	// Work through a pointer and slice indices so that every write below
	// lands on the daemonset itself rather than on a range-loop copy.
	podSpec := &obj.Spec.Template.Spec

	containsDriverInstallDirVolume := false
	for i := range podSpec.Volumes {
		volume := &podSpec.Volumes[i]
		if volume.Name != volumeName {
			continue
		}
		// A volume with this name that is not backed by a hostPath has no
		// host path to retarget; dereferencing its nil HostPath would panic.
		if volume.HostPath != nil {
			volume.HostPath.Path = driverInstallDir
			containsDriverInstallDirVolume = true
		}
		break
	}

	if !containsDriverInstallDirVolume {
		return
	}

	for i := range podSpec.InitContainers {
		ctr := &podSpec.InitContainers[i]
		if ctr.Name != "driver-validation" {
			continue
		}
		setContainerEnv(ctr, DriverInstallDirEnvName, driverInstallDir)
		setContainerEnv(ctr, DriverInstallDirCtrPathEnvName, driverInstallDir)
		for j := range ctr.VolumeMounts {
			if ctr.VolumeMounts[j].Name == volumeName {
				ctr.VolumeMounts[j].MountPath = driverInstallDir
			}
		}
	}
}

// TransformGPUDiscoveryPlugin transforms GPU discovery daemonset with required config as per ClusterPolicy
func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
// update validation container
Expand Down
85 changes: 85 additions & 0 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ func (d Daemonset) WithEnvVar(name string, value string) Daemonset {
return d
}

// WithInitContainer appends container to the init containers of the
// daemonset's pod template and returns the daemonset so calls can be chained.
func (d Daemonset) WithInitContainer(container corev1.Container) Daemonset {
	podSpec := &d.Spec.Template.Spec
	podSpec.InitContainers = append(podSpec.InitContainers, container)
	return d
}

func TestTransformForHostRoot(t *testing.T) {
hostRootVolumeName := "host-root"
hostDevCharVolumeName := "host-dev-char"
Expand Down Expand Up @@ -139,3 +144,83 @@ func TestTransformForHostRoot(t *testing.T) {
})
}
}

func TestTransformForDriverInstallDir(t *testing.T) {
driverInstallDirVolumeName := "driver-install-dir"
testCases := []struct {
description string
driverInstallDir string
input Daemonset
expectedOutput Daemonset
}{
{
description: "no driver-install-dir volume in daemonset",
driverInstallDir: "/custom-root",
input: NewDaemonset(),
expectedOutput: NewDaemonset(),
},
{
description: "empty driverInstallDir is a no-op",
driverInstallDir: "",
input: NewDaemonset().
WithHostPathVolume(driverInstallDirVolumeName, "/run/nvidia/driver").
WithInitContainer(
corev1.Container{
Name: "driver-validation",
VolumeMounts: []corev1.VolumeMount{
{Name: driverInstallDirVolumeName, MountPath: "/run/nvidia/driver"},
},
}),
expectedOutput: NewDaemonset().
WithHostPathVolume(driverInstallDirVolumeName, "/run/nvidia/driver").
WithInitContainer(
corev1.Container{
Name: "driver-validation",
VolumeMounts: []corev1.VolumeMount{
{Name: driverInstallDirVolumeName, MountPath: "/run/nvidia/driver"},
},
}),
},
{
description: "custom driverInstallDir with driver-install-dir volume",
driverInstallDir: "/custom-root",
input: NewDaemonset().
WithHostPathVolume(driverInstallDirVolumeName, "/run/nvidia/driver"),
expectedOutput: NewDaemonset().
WithHostPathVolume(driverInstallDirVolumeName, "/custom-root"),
},
{
description: "custom driverInstallDir with driver-install-dir volume and driver-validation initContainer",
driverInstallDir: "/custom-root",
input: NewDaemonset().
WithHostPathVolume(driverInstallDirVolumeName, "/run/nvidia/driver").
WithInitContainer(
corev1.Container{
Name: "driver-validation",
VolumeMounts: []corev1.VolumeMount{
{Name: driverInstallDirVolumeName, MountPath: "/run/nvidia/driver"},
},
}),
expectedOutput: NewDaemonset().
WithHostPathVolume(driverInstallDirVolumeName, "/custom-root").
WithInitContainer(
corev1.Container{
Name: "driver-validation",
VolumeMounts: []corev1.VolumeMount{
{Name: driverInstallDirVolumeName, MountPath: "/custom-root"},
},
Env: []corev1.EnvVar{
{Name: DriverInstallDirEnvName, Value: "/custom-root"},
{Name: DriverInstallDirCtrPathEnvName, Value: "/custom-root"},
},
}),
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
transformForDriverInstallDir(tc.input.DaemonSet, tc.driverInstallDir)
require.EqualValues(t, tc.expectedOutput, tc.input)
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,11 @@ spec:
description: HostPaths defines various paths on the host needed by
GPU Operator components
properties:
driverInstallDir:
description: |-
DriverInstallDir represents the root at which driver files including libraries,
config files, and executables can be found.
type: string
rootFS:
description: |-
RootFS represents the path to the root filesystem of the host.
Expand Down
1 change: 1 addition & 0 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ metadata:
spec:
hostPaths:
rootFS: {{ .Values.hostPaths.rootFS }}
driverInstallDir: {{ .Values.hostPaths.driverInstallDir }}
operator:
{{- if .Values.operator.defaultRuntime }}
defaultRuntime: {{ .Values.operator.defaultRuntime }}
Expand Down
4 changes: 4 additions & 0 deletions deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ hostPaths:
# stop, start, or restart systemd services
rootFS: "/"

# driverInstallDir represents the root at which driver files including libraries,
# config files, and executables can be found.
driverInstallDir: "/run/nvidia/driver"

daemonsets:
labels: {}
annotations: {}
Expand Down
73 changes: 73 additions & 0 deletions validator/driver.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/

package main

// driverInfo contains information about an NVIDIA driver installation.
//
// isHostDriver indicates whether the driver is installed directly on
// the host at the host's root filesystem.
//
// hostRoot represents the host's root filesystem (typically '/').
//
// driverRoot and devRoot represent the absolute paths of the driver install
// and NVIDIA device nodes on the host.
//
// driverRootCtrPath and devRootCtrPath represent the paths of the driver install
// and NVIDIA device nodes in the management containers that require them, like
// the Toolkit Container, the Device Plugin, and MIG Manager.
type driverInfo struct {
// isHostDriver is true when the driver is installed directly on the host.
isHostDriver bool
// hostRoot is the path of the host's root filesystem.
hostRoot string
// driverRoot is the absolute path of the driver install on the host;
// getDriverInfo sets it to hostRoot for a host driver and to the driver
// install dir otherwise.
driverRoot string
// driverRootCtrPath is the path of the driver install inside the management
// containers; getDriverInfo uses "/host" for a host driver and
// "/driver-root" otherwise.
driverRootCtrPath string
// devRoot is the absolute path on the host under which the NVIDIA device
// nodes are rooted; getDriverInfo sets it to either hostRoot or the driver
// install dir.
devRoot string
// devRootCtrPath is the path of devRoot inside the management containers;
// getDriverInfo uses "/host" or "/driver-root" to match devRoot.
devRootCtrPath string
}

// getDriverInfo assembles the driverInfo for a driver installation.
//
// For a host driver both the driver root and the dev root are hostRoot, and
// both are exposed in containers at "/host". Otherwise the driver root is
// driverInstallDir (exposed at "/driver-root"), and the dev root is chosen by
// probing driverInstallDirCtrPath: if its dev root resolves to "/", the device
// nodes are taken from hostRoot; if not, they are taken from driverInstallDir.
func getDriverInfo(isHostDriver bool, hostRoot string, driverInstallDir string, driverInstallDirCtrPath string) driverInfo {
	const (
		hostCtrPath       = "/host"
		driverRootCtrPath = "/driver-root"
	)

	info := driverInfo{
		isHostDriver: isHostDriver,
		hostRoot:     hostRoot,
	}

	if isHostDriver {
		info.driverRoot, info.driverRootCtrPath = hostRoot, hostCtrPath
		info.devRoot, info.devRootCtrPath = hostRoot, hostCtrPath
		return info
	}

	info.driverRoot, info.driverRootCtrPath = driverInstallDir, driverRootCtrPath

	// For drivers not installed directly on the host, the device nodes live
	// either under hostRoot or under driverInstallDir.
	switch root(driverInstallDirCtrPath).getDevRoot() {
	case "/":
		info.devRoot, info.devRootCtrPath = hostRoot, hostCtrPath
	default:
		info.devRoot, info.devRootCtrPath = driverInstallDir, driverRootCtrPath
	}

	return info
}
20 changes: 20 additions & 0 deletions validator/find.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package main

import (
"fmt"
"os"
"path/filepath"
)

Expand Down Expand Up @@ -60,6 +61,25 @@ func (r root) getNvidiaSMIPath() (string, error) {
return binaryPath, nil
}

// isDevRoot reports whether the root is a dev root, i.e. whether it contains
// a "dev" entry that is a directory. Any error from stat'ing that entry is
// treated as "not a dev root".
func (r root) isDevRoot() bool {
	devDir := filepath.Join(string(r), "dev")
	info, err := os.Stat(devDir)
	return err == nil && info.IsDir()
}

// getDevRoot returns the dev root associated with the root: the root itself
// when it is a dev root, and "/" in every other case.
func (r root) getDevRoot() string {
	if !r.isDevRoot() {
		return "/"
	}
	return string(r)
}

// findFile searches the root for a specified file.
// A number of folders can be specified to search in addition to the root itself.
// If the file represents a symlink, this is resolved and the final path is returned.
Expand Down
Loading

0 comments on commit d53e3e3

Please sign in to comment.