Merge branch 'main' into git-workflow-linter

NVIDIA · May 8, 2024 · 0d12cb8 · 0d12cb8
2 parents 29b9a9d + 11a08a2
commit 0d12cb8
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 4 deletions.
diff --git a/docs/deployment.md b/docs/deployment.md
@@ -49,17 +49,17 @@ kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/ch
 
 For configuring virtual nodes, you need to provide the `values.yaml` file to define the type and quantity of nodes you wish to create. You also have the option to enhance node configurations by adding annotations, labels, and conditions. For guidance, refer to the [values-example.yaml](../charts/virtual-nodes/values-example.yaml) file.
 
-Currently, the system supports the following node types:
+Currently, the system includes the following node types:
 - [dgxa100.40g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview)
 - [dgxa100.80g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview)
 - [dgxh100.80g](https://docs.nvidia.com/dgx/dgxh100-user-guide/introduction-to-dgxh100.html#hardware-overview)
 - cpu.x86
 
-If you need to introduce additional node types, update the `values.yaml` file with the necessary node information (such as type and count) and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file.
+If you need to introduce additional node types, update the values file used for node configuration with the node information (such as type and count), and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file.
 
-To deploy these nodes, use the Helm command:
+To deploy the nodes in `values-example.yaml`, use the Helm command:
 ```bash
-helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values.yaml
+helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values-example.yaml
 ```
 
 ## Running Knavigator

diff --git a/resources/templates/k8s/failed-job.yml b/resources/templates/k8s/failed-job.yml
@@ -0,0 +1,40 @@
+apiVersion: batch/v1  # Go-template Job manifest; placeholders are filled in by the caller
+kind: Job
+metadata:
+  name: "{{._NAME_}}"
+  namespace: "{{.namespace}}"
+spec:
+  backoffLimit: {{.backoffLimit}}  # numeric fields stay unquoted so they render as integers
+  completions: {{.completions}}
+  parallelism: {{.parallelism}}
+  completionMode: {{.completionMode}}
+  template:
+    metadata:
+      labels:
+        pod-init-container-running-failed.stage.kwok.x-k8s.io: "true"  # NOTE(review): presumably opts the pod into a KWOK init-container failure stage; confirm
+      annotations:
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/container-name: "{{.failureContainerName}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/reason: "{{.failureReason}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/message: "{{.failureMessage}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/exit-code: "{{.failureExitCode}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/delay: "{{.failureDelay}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/jitter-delay: "{{.failureJitterDelay}}"
+    spec:
+      schedulerName: default-scheduler
+      initContainers:
+      - name: "{{.failureContainerName}}"  # same value as the container-name annotation above
+        image: "{{.initContainerImage}}"
+      containers:
+      - name: test
+        image: "{{.containerImage}}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          limits:
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"  # quoted like cpu so the rendered quantity is always a string
+            nvidia.com/gpu: "{{.gpu}}"
+          requests:
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"
+            nvidia.com/gpu: "{{.gpu}}"
+      restartPolicy: OnFailure
diff --git a/resources/tests/k8s/test-failed-job.yml b/resources/tests/k8s/test-failed-job.yml
@@ -0,0 +1,37 @@
+name: test-k8s-job
+description: submit a k8s job with a failing init container and validate that its pod fails
+tasks:
+- id: job  # submits one Job rendered from the failed-job template
+  type: SubmitObj
+  params:
+    count: 1
+    grv:
+      group: batch
+      version: v1
+      resource: jobs
+    template: "resources/templates/k8s/failed-job.yml"
+    nameformat: "job{{._ENUM_}}"
+    overrides:
+      namespace: default
+      parallelism: 1
+      completions: 1
+      backoffLimit: 0
+      completionMode: NonIndexed
+      # images are passed as containerImage / initContainerImage; the template reads no plain image key
+      cpu: 100m
+      memory: 512M
+      gpu: 8
+      containerImage: ubuntu
+      initContainerImage: nccl
+      failureContainerName: nccl-test
+      failureReason: nccl-test-failed
+      failureMessage: "nccl test failed"
+      failureExitCode: 1
+      failureDelay: 1000  # NOTE(review): units are not shown here; confirm against the KWOK stage docs
+      failureJitterDelay: 5000
+- id: status  # checks the pods of task "job" for status Failed, with a 10s timeout
+  type: CheckPod
+  params:
+    refTaskId: job
+    status: Failed
+    timeout: 10s