From b989dcd50d1b0feb8ffa94ef090b28d4ff78c459 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Wed, 8 May 2024 08:44:04 -0700 Subject: [PATCH 1/2] Add test for inject failure to init containers Signed-off-by: Yuan Chen Update failure test Create job failure test for k8s Signed-off-by: Yuan Chen --- resources/templates/k8s/failed-job.yml | 40 +++++++++++++++++++++++++ resources/tests/k8s/test-failed-job.yml | 37 +++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 resources/templates/k8s/failed-job.yml create mode 100644 resources/tests/k8s/test-failed-job.yml diff --git a/resources/templates/k8s/failed-job.yml b/resources/templates/k8s/failed-job.yml new file mode 100644 index 0000000..ad011a0 --- /dev/null +++ b/resources/templates/k8s/failed-job.yml @@ -0,0 +1,40 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{._NAME_}}" + namespace: "{{.namespace}}" +spec: + backoffLimit: {{.backoffLimit}} + completions: {{.completions}} + parallelism: {{.parallelism}} + completionMode: {{.completionMode}} + template: + metadata: + labels: + pod-init-container-running-failed.stage.kwok.x-k8s.io: "true" + annotations: + pod-init-container-running-failed.stage.kwok.x-k8s.io/container-name: "{{.failureContainerName}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/reason: "{{.failureReason}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/message: "{{.failureMessage}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/exit-code: "{{.failureExitCode}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/delay: "{{.failureDelay}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/jitter-delay: "{{.failureJitterDelay}}" + spec: + schedulerName: default-scheduler + initContainers: + - name: {{.failureContainerName}} + image: {{.initContainerImage}} + containers: + - name: test + image: {{.containerImage}} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + requests: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + restartPolicy: OnFailure diff --git a/resources/tests/k8s/test-failed-job.yml b/resources/tests/k8s/test-failed-job.yml new file mode 100644 index 0000000..53e6b67 --- /dev/null +++ b/resources/tests/k8s/test-failed-job.yml @@ -0,0 +1,37 @@ +name: test-k8s-job +description: submit and validate a k8s job +tasks: +- id: job + type: SubmitObj + params: + count: 1 + grv: + group: batch + version: v1 + resource: jobs + template: "resources/templates/k8s/failed-job.yml" + nameformat: "job{{._ENUM_}}" + overrides: + namespace: default + parallelism: 1 + completions: 1 + backoffLimit: 0 + completionMode: NonIndexed + image: ubuntu + cpu: 100m + memory: 512M + gpu: 8 + containerImage: ubuntu + initContainerImage: nccl + failureContainerName: nccl-test + failureReason: nccl-test-failed + failureMessage: "nccl test failed" + failureExitCode: 1 + failureDelay: 1000 + failureJitterDelay: 5000 +- id: status + type: CheckPod + params: + refTaskId: job + status: Failed + timeout: 10s From 1dbde6b3b51e9616379f6270e6bb58464496ca01 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Wed, 8 May 2024 09:32:01 -0700 Subject: [PATCH 2/2] Fix the deployment doc and virtual node installation Signed-off-by: Yuan Chen Use values-example.yaml to deploy nodes Revert changes to values.yaml for virtual-nodes Update deployment doc --- docs/deployment.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/deployment.md b/docs/deployment.md index 2eaf5e6..ae354d3 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -49,17 +49,17 @@ kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/ch For configuring virtual nodes, you need to provide the `values.yaml` file to define the type and quantity of nodes you wish to create. You also have the option to enhance node configurations by adding annotations, labels, and conditions. For guidance, refer to the [values-example.yaml](../charts/virtual-nodes/values-example.yaml) file. -Currently, the system supports the following node types: +Currently, the system includes the following node types: - [dgxa100.40g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview) - [dgxa100.80g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview) - [dgxh100.80g](https://docs.nvidia.com/dgx/dgxh100-user-guide/introduction-to-dgxh100.html#hardware-overview) - cpu.x86 -If you need to introduce additional node types, update the `values.yaml` file with the necessary node information (such as type and count) and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file. +If you need to introduce additional node types, update the values file used for node configuration with the node information (such as type and count), and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file. -To deploy these nodes, use the Helm command: +To deploy the nodes in `values-example.yaml`, use the Helm command: ```bash -helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values.yaml +helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values-example.yaml ``` ## Running Knavigator