Skip to content

Commit

Permalink
Merge pull request #747 from NVIDIA/custom-roots
Browse files Browse the repository at this point in the history
Add rootFS and driverInstallDir fields to ClusterPolicy
  • Loading branch information
cdesiniotis authored Jun 17, 2024
2 parents 7fb39d6 + d53e3e3 commit 2c4f301
Show file tree
Hide file tree
Showing 24 changed files with 912 additions and 168 deletions.
20 changes: 20 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ import (
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// ClusterPolicyCRDName holds the literal name "ClusterPolicy" used to refer to
// the ClusterPolicy CRD.
const ClusterPolicyCRDName = "ClusterPolicy"

// ClusterPolicySpec defines the desired state of ClusterPolicy
type ClusterPolicySpec struct {
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
Expand Down Expand Up @@ -88,6 +92,8 @@ type ClusterPolicySpec struct {
KataManager KataManagerSpec `json:"kataManager,omitempty"`
// CCManager component spec
CCManager CCManagerSpec `json:"ccManager,omitempty"`
// HostPaths defines various paths on the host needed by GPU Operator components
HostPaths HostPathsSpec `json:"hostPaths,omitempty"`
}

// Runtime defines container runtime type
Expand Down Expand Up @@ -144,6 +150,20 @@ type OperatorSpec struct {
UseOpenShiftDriverToolkit *bool `json:"use_ocp_driver_toolkit,omitempty"`
}

// HostPathsSpec groups the host filesystem locations that GPU Operator
// components depend on.
type HostPathsSpec struct {
	// RootFS is the path to the host's root filesystem. Components that have
	// to operate on the host filesystem use it, so it must be a filesystem
	// that can be chroot-ed into. The MIG Manager and the Toolkit Container
	// are examples: they may need to stop, start, or restart systemd services.
	RootFS string `json:"rootFS,omitempty"`

	// DriverInstallDir is the root directory under which the driver's files
	// (libraries, config files, and executables) can be found.
	DriverInstallDir string `json:"driverInstallDir,omitempty"`
}

// EnvVar represents an environment variable present in a Container.
type EnvVar struct {
// Name of the environment variable.
Expand Down
16 changes: 16 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions assets/state-container-toolkit/0200_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,9 @@ rules:
- use
resourceNames:
- privileged
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- list
18 changes: 8 additions & 10 deletions assets/state-container-toolkit/0400_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,15 @@ data:
entrypoint.sh: |-
#!/bin/bash
set -e
until [[ -f /run/nvidia/validations/driver-ready ]]
do
echo "waiting for the driver validations to be ready..."
sleep 5
done
driver_root=/run/nvidia/driver
driver_root_ctr_path=$driver_root
if [[ -f /run/nvidia/validations/host-driver-ready ]]; then
driver_root=/
driver_root_ctr_path=/host
fi
export NVIDIA_DRIVER_ROOT=$driver_root
export DRIVER_ROOT_CTR_PATH=$driver_root_ctr_path
set -o allexport
cat /run/nvidia/validations/driver-ready
. /run/nvidia/validations/driver-ready
#
# The below delay is a workaround for an issue affecting some versions
Expand Down
25 changes: 18 additions & 7 deletions assets/state-container-toolkit/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,16 @@ spec:
value: "true"
- name: COMPONENT
value: driver
- name: OPERATOR_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
securityContext:
privileged: true
seLinuxOptions:
level: "s0"
volumeMounts:
- name: driver-install-path
- name: driver-install-dir
mountPath: /run/nvidia/driver
mountPropagation: HostToContainer
- name: run-nvidia-validations
Expand All @@ -67,6 +71,8 @@ spec:
value: "management.nvidia.com/gpu"
- name: NVIDIA_VISIBLE_DEVICES
value: "void"
- name: TOOLKIT_PID_FILE
value: "/run/nvidia/toolkit/toolkit.pid"
imagePullPolicy: IfNotPresent
name: nvidia-container-toolkit-ctr
securityContext:
Expand All @@ -78,13 +84,17 @@ spec:
readOnly: true
mountPath: /bin/entrypoint.sh
subPath: entrypoint.sh
- name: nvidia-run-path
mountPath: /run/nvidia
mountPropagation: Bidirectional
- name: toolkit-root
mountPath: /run/nvidia/toolkit
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
- name: toolkit-install-dir
mountPath: /usr/local/nvidia
- name: crio-hooks
mountPath: /usr/share/containers/oci/hooks.d
- name: driver-install-dir
mountPath: /driver-root
mountPropagation: HostToContainer
- name: host-root
mountPath: /host
readOnly: true
Expand All @@ -96,17 +106,18 @@ spec:
configMap:
name: nvidia-container-toolkit-entrypoint
defaultMode: 448
- name: nvidia-run-path
- name: toolkit-root
hostPath:
path: /run/nvidia
path: /run/nvidia/toolkit
type: DirectoryOrCreate
- name: run-nvidia-validations
hostPath:
path: /run/nvidia/validations
type: DirectoryOrCreate
- name: driver-install-path
- name: driver-install-dir
hostPath:
path: /run/nvidia/driver
type: DirectoryOrCreate
- name: host-root
hostPath:
path: /
Expand Down
29 changes: 8 additions & 21 deletions assets/state-device-plugin/0400_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,15 @@ data:
entrypoint.sh: |-
#!/bin/bash
driver_root=""
container_driver_root=""
while true; do
if [[ -f /run/nvidia/validations/host-driver-ready ]]; then
driver_root=/
container_driver_root=/host
break
elif [[ -f /run/nvidia/validations/driver-ready ]]; then
driver_root=/run/nvidia/driver
container_driver_root=$driver_root
break
else
echo "waiting for the driver validations to be ready..."
sleep 5
fi
until [[ -f /run/nvidia/validations/driver-ready ]]
do
echo "waiting for the driver validations to be ready..."
sleep 5
done
export NVIDIA_DRIVER_ROOT=$driver_root
echo "NVIDIA_DRIVER_ROOT=$NVIDIA_DRIVER_ROOT"
export CONTAINER_DRIVER_ROOT=$container_driver_root
echo "CONTAINER_DRIVER_ROOT=$CONTAINER_DRIVER_ROOT"
set -o allexport
cat /run/nvidia/validations/driver-ready
. /run/nvidia/validations/driver-ready
echo "Starting nvidia-device-plugin"
exec nvidia-device-plugin
20 changes: 13 additions & 7 deletions assets/state-device-plugin/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ spec:
securityContext:
privileged: true
volumeMounts:
- name: run-nvidia
mountPath: /run/nvidia
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: HostToContainer
- image: "FILLED BY THE OPERATOR"
name: config-manager-init
Expand Down Expand Up @@ -91,8 +91,10 @@ spec:
subPath: entrypoint.sh
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: run-nvidia
mountPath: /run/nvidia
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
- name: driver-install-dir
mountPath: /driver-root
mountPropagation: HostToContainer
- name: host-root
mountPath: /host
Expand Down Expand Up @@ -141,10 +143,14 @@ spec:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: run-nvidia
- name: run-nvidia-validations
hostPath:
path: "/run/nvidia"
type: Directory
path: "/run/nvidia/validations"
type: DirectoryOrCreate
- name: driver-install-dir
hostPath:
path: "/run/nvidia/driver"
type: DirectoryOrCreate
- name: host-root
hostPath:
path: /
Expand Down
37 changes: 11 additions & 26 deletions assets/state-mig-manager/0420_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,19 @@ data:
entrypoint.sh: |-
#!/bin/bash
host_driver=""
driver_root=""
driver_root_ctr_path=""
while true; do
if [[ -f /run/nvidia/validations/host-driver-ready ]]; then
host_driver=true
driver_root="/"
driver_root_ctr_path="/host"
break
elif [[ -f /run/nvidia/validations/driver-ready ]]; then
host_driver=false
driver_root="/run/nvidia/driver"
driver_root_ctr_path="/run/nvidia/driver"
break
else
echo "waiting for the driver validations to be ready..."
sleep 5
fi
until [[ -f /run/nvidia/validations/driver-ready ]]
do
echo "waiting for the driver validations to be ready..."
sleep 5
done
export WITH_SHUTDOWN_HOST_GPU_CLIENTS=$host_driver
set -o allexport
cat /run/nvidia/validations/driver-ready
. /run/nvidia/validations/driver-ready
# manually export additional envs required by mig-manager
export WITH_SHUTDOWN_HOST_GPU_CLIENTS=$IS_HOST_DRIVER
echo "WITH_SHUTDOWN_HOST_GPU_CLIENTS=$WITH_SHUTDOWN_HOST_GPU_CLIENTS"
export DRIVER_ROOT=$driver_root
echo "DRIVER_ROOT=$DRIVER_ROOT"
export DRIVER_ROOT_CTR_PATH=$driver_root_ctr_path
echo "DRIVER_ROOT_CTR_PATH=$DRIVER_ROOT_CTR_PATH"
echo "Starting nvidia-mig-manager"
exec nvidia-mig-manager
20 changes: 13 additions & 7 deletions assets/state-mig-manager/0600_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ spec:
securityContext:
privileged: true
volumeMounts:
- name: run-nvidia
mountPath: /run/nvidia
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: HostToContainer
containers:
- name: nvidia-mig-manager
Expand Down Expand Up @@ -62,6 +62,8 @@ spec:
readOnly: true
mountPath: /bin/entrypoint.sh
subPath: entrypoint.sh
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
- mountPath: /sys
name: host-sys
- mountPath: /mig-parted-config
Expand All @@ -71,8 +73,8 @@ spec:
mountPropagation: HostToContainer
- mountPath: /gpu-clients
name: gpu-clients
- name: run-nvidia
mountPath: /run/nvidia
- name: driver-install-dir
mountPath: /driver-root
mountPropagation: HostToContainer
- name: cdi-root
mountPath: /var/run/cdi
Expand All @@ -88,10 +90,14 @@ spec:
- name: mig-parted-config
configMap:
name: "FILLED_BY_OPERATOR"
- name: run-nvidia
- name: run-nvidia-validations
hostPath:
path: "/run/nvidia"
type: Directory
path: "/run/nvidia/validations"
type: DirectoryOrCreate
- name: driver-install-dir
hostPath:
path: "/run/nvidia/driver"
type: DirectoryOrCreate
- name: host-root
hostPath:
path: "/"
Expand Down
8 changes: 6 additions & 2 deletions assets/state-operator-validation/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ spec:
value: "true"
- name: COMPONENT
value: driver
- name: OPERATOR_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
securityContext:
privileged: true
seLinuxOptions:
Expand All @@ -44,7 +48,7 @@ spec:
mountPath: /host
readOnly: true
mountPropagation: HostToContainer
- name: driver-install-path
- name: driver-install-dir
mountPath: /run/nvidia/driver
mountPropagation: HostToContainer
- name: run-nvidia-validations
Expand Down Expand Up @@ -160,7 +164,7 @@ spec:
hostPath:
path: /run/nvidia/validations
type: DirectoryOrCreate
- name: driver-install-path
- name: driver-install-dir
hostPath:
path: /run/nvidia/driver
- name: host-root
Expand Down
Loading

0 comments on commit 2c4f301

Please sign in to comment.