From b989dcd50d1b0feb8ffa94ef090b28d4ff78c459 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Wed, 8 May 2024 08:44:04 -0700
Subject: [PATCH 1/2] Add test for injecting failure into init containers

Signed-off-by: Yuan Chen <yuanc@nvidia.com>

Update failure test

Create job failure test for k8s

Signed-off-by: Yuan Chen <yuanc@nvidia.com>
---
 resources/templates/k8s/failed-job.yml  | 40 +++++++++++++++++++++++++
 resources/tests/k8s/test-failed-job.yml | 37 +++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 resources/templates/k8s/failed-job.yml
 create mode 100644 resources/tests/k8s/test-failed-job.yml

diff --git a/resources/templates/k8s/failed-job.yml b/resources/templates/k8s/failed-job.yml
new file mode 100644
index 0000000..ad011a0
--- /dev/null
+++ b/resources/templates/k8s/failed-job.yml
@@ -0,0 +1,40 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "{{._NAME_}}"
+  namespace: "{{.namespace}}"
+spec:
+  backoffLimit: {{.backoffLimit}}  # numeric fields stay unquoted so they render as integers
+  completions: {{.completions}}
+  parallelism: {{.parallelism}}
+  completionMode: {{.completionMode}}
+  template:
+    metadata:
+      labels:  # NOTE(review): presumably opts the pod into the kwok stage named by the key - confirm
+        pod-init-container-running-failed.stage.kwok.x-k8s.io: "true"
+      annotations:  # failure parameters, all supplied through template values
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/container-name: "{{.failureContainerName}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/reason: "{{.failureReason}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/message: "{{.failureMessage}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/exit-code: "{{.failureExitCode}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/delay: "{{.failureDelay}}"
+        pod-init-container-running-failed.stage.kwok.x-k8s.io/jitter-delay: "{{.failureJitterDelay}}"
+    spec:
+      schedulerName: default-scheduler
+      initContainers:
+      - name: "{{.failureContainerName}}"  # same value as the container-name annotation above
+        image: "{{.initContainerImage}}"
+      containers:
+      - name: test
+        image: "{{.containerImage}}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          limits:
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"
+            nvidia.com/gpu: "{{.gpu}}"
+          requests:
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"
+            nvidia.com/gpu: "{{.gpu}}"
+      restartPolicy: OnFailure
diff --git a/resources/tests/k8s/test-failed-job.yml b/resources/tests/k8s/test-failed-job.yml
new file mode 100644
index 0000000..53e6b67
--- /dev/null
+++ b/resources/tests/k8s/test-failed-job.yml
@@ -0,0 +1,37 @@
+name: test-k8s-job
+description: submit a k8s job with a failing init container and validate that its pod fails
+tasks:
+- id: job
+  type: SubmitObj
+  params:
+    count: 1
+    grv:
+      group: batch
+      version: v1
+      resource: jobs
+    template: "resources/templates/k8s/failed-job.yml"
+    nameformat: "job{{._ENUM_}}"
+    overrides:  # values substituted into the template named above
+      namespace: default
+      parallelism: 1
+      completions: 1
+      backoffLimit: 0
+      completionMode: NonIndexed
+      image: ubuntu  # NOTE(review): failed-job.yml has no matching placeholder; looks unused - confirm
+      cpu: 100m
+      memory: 512M
+      gpu: 8
+      containerImage: ubuntu
+      initContainerImage: nccl
+      failureContainerName: nccl-test  # used for both the init container name and the annotation
+      failureReason: nccl-test-failed
+      failureMessage: "nccl test failed"
+      failureExitCode: 1
+      failureDelay: 1000  # NOTE(review): unit is not visible here - confirm it fits the 10s timeout below
+      failureJitterDelay: 5000
+- id: status
+  type: CheckPod
+  params:
+    refTaskId: job
+    status: Failed
+    timeout: 10s

From 1dbde6b3b51e9616379f6270e6bb58464496ca01 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanc@nvidia.com>
Date: Wed, 8 May 2024 09:32:01 -0700
Subject: [PATCH 2/2] Fix the deployment doc and virtual node installation

Signed-off-by: Yuan Chen <yuanc@nvidia.com>

Use values-example.yaml to deploy nodes

Revert changes to values.yaml for virtual-nodes

Update deployment doc
---
 docs/deployment.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/deployment.md b/docs/deployment.md
index 2eaf5e6..ae354d3 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -49,17 +49,17 @@ kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/ch
 
 For configuring virtual nodes, you need to provide the `values.yaml` file to define the type and quantity of nodes you wish to create. You also have the option to enhance node configurations by adding annotations, labels, and conditions. For guidance, refer to the [values-example.yaml](../charts/virtual-nodes/values-example.yaml) file.
 
-Currently, the system supports the following node types:
+Currently, the system includes the following node types:
 - [dgxa100.40g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview)
 - [dgxa100.80g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview)
 - [dgxh100.80g](https://docs.nvidia.com/dgx/dgxh100-user-guide/introduction-to-dgxh100.html#hardware-overview)
 - cpu.x86
 
-If you need to introduce additional node types, update the `values.yaml` file with the necessary node information (such as type and count) and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file.
+If you need to introduce additional node types, update the values file used for node configuration with the node information (such as type and count), and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file.
 
-To deploy these nodes, use the Helm command:
+To deploy the nodes defined in `values-example.yaml`, run the following Helm command:
 ```bash
-helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values.yaml
+helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values-example.yaml
 ```
 
 ## Running Knavigator