diff --git a/docs/deployment.md b/docs/deployment.md index 2eaf5e6..ae354d3 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -49,17 +49,17 @@ kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/ch For configuring virtual nodes, you need to provide the `values.yaml` file to define the type and quantity of nodes you wish to create. You also have the option to enhance node configurations by adding annotations, labels, and conditions. For guidance, refer to the [values-example.yaml](../charts/virtual-nodes/values-example.yaml) file. -Currently, the system supports the following node types: +Currently, the system includes the following node types: - [dgxa100.40g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview) - [dgxa100.80g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview) - [dgxh100.80g](https://docs.nvidia.com/dgx/dgxh100-user-guide/introduction-to-dgxh100.html#hardware-overview) - cpu.x86 -If you need to introduce additional node types, update the `values.yaml` file with the necessary node information (such as type and count) and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file. +If you need to introduce additional node types, update the values file used for node configuration with the node information (such as type and count), and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file. 
-To deploy these nodes, use the Helm command: +To deploy the nodes in `values-example.yaml`, use the Helm command: ```bash -helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values.yaml +helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values-example.yaml ``` ## Running Knavigator diff --git a/resources/templates/k8s/failed-job.yml b/resources/templates/k8s/failed-job.yml new file mode 100644 index 0000000..ad011a0 --- /dev/null +++ b/resources/templates/k8s/failed-job.yml @@ -0,0 +1,40 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{._NAME_}}" + namespace: "{{.namespace}}" +spec: + backoffLimit: {{.backoffLimit}} + completions: {{.completions}} + parallelism: {{.parallelism}} + completionMode: {{.completionMode}} + template: + metadata: + labels: + pod-init-container-running-failed.stage.kwok.x-k8s.io: "true" + annotations: + pod-init-container-running-failed.stage.kwok.x-k8s.io/container-name: "{{.failureContainerName}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/reason: "{{.failureReason}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/message: "{{.failureMessage}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/exit-code: "{{.failureExitCode}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/delay: "{{.failureDelay}}" + pod-init-container-running-failed.stage.kwok.x-k8s.io/jitter-delay: "{{.failureJitterDelay}}" + spec: + schedulerName: default-scheduler + initContainers: + - name: {{.failureContainerName}} + image: {{.initContainerImage}} + containers: + - name: test + image: {{.containerImage}} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + requests: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + restartPolicy: OnFailure diff --git a/resources/tests/k8s/test-failed-job.yml b/resources/tests/k8s/test-failed-job.yml new file mode 100644 index 0000000..53e6b67 --- 
/dev/null +++ b/resources/tests/k8s/test-failed-job.yml @@ -0,0 +1,37 @@ +name: test-k8s-failed-job +description: submit a k8s job with a failing init container and validate that its pod reaches the Failed status +tasks: +- id: job + type: SubmitObj + params: + count: 1 + grv: + group: batch + version: v1 + resource: jobs + template: "resources/templates/k8s/failed-job.yml" + nameformat: "job{{._ENUM_}}" + overrides: + namespace: default + parallelism: 1 + completions: 1 + backoffLimit: 0 + completionMode: NonIndexed + image: ubuntu + cpu: 100m + memory: 512M + gpu: 8 + containerImage: ubuntu + initContainerImage: nccl + failureContainerName: nccl-test + failureReason: nccl-test-failed + failureMessage: "nccl test failed" + failureExitCode: 1 + failureDelay: 1000 + failureJitterDelay: 5000 +- id: status + type: CheckPod + params: + refTaskId: job + status: Failed + timeout: 10s