From b8c6b9abfe5e6964bd21e9f3d4ae0b99330f2aa1 Mon Sep 17 00:00:00 2001
From: kuizhiqing
Date: Fri, 29 Dec 2023 09:41:00 +0000
Subject: [PATCH] add deepspeed example

Add a README and a minimal MPIJob manifest under
examples/v2beta1/deepspeed. The job defines one Launcher replica and two
Worker replicas; the launcher container runs `deepspeed` against
HelloDeepSpeed's train_bert_ds.py with --checkpoint_dir /workspace.

---
NOTE(review): slotsPerWorker is 1 while every worker requests
nvidia.com/gpu: 8. Confirm whether one slot per 8-GPU worker is intended
before merging.
NOTE(review): the README heading typo ("DeeepSpeed") was corrected in place,
so the README blob id on the `index` line below predates that edit.

 examples/v2beta1/deepspeed/README.MD    |  8 +++++
 .../deepspeed/deepspeed-helloworld.yaml | 32 +++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 examples/v2beta1/deepspeed/README.MD
 create mode 100644 examples/v2beta1/deepspeed/deepspeed-helloworld.yaml

diff --git a/examples/v2beta1/deepspeed/README.MD b/examples/v2beta1/deepspeed/README.MD
new file mode 100644
index 00000000..7128f1b6
--- /dev/null
+++ b/examples/v2beta1/deepspeed/README.MD
@@ -0,0 +1,8 @@
+# DeepSpeed Example
+
+This demo introduces the basic usage of DeepSpeed with mpi-operator.
+
+## References
+
+* https://github.com/microsoft/DeepSpeedExamples/blob/master/training/HelloDeepSpeed/README.md
+* https://www.alibabacloud.com/help/en/ack/cloud-native-ai-suite/user-guide/deepspeed-distributed-training
diff --git a/examples/v2beta1/deepspeed/deepspeed-helloworld.yaml b/examples/v2beta1/deepspeed/deepspeed-helloworld.yaml
new file mode 100644
index 00000000..9bd863dd
--- /dev/null
+++ b/examples/v2beta1/deepspeed/deepspeed-helloworld.yaml
@@ -0,0 +1,32 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: deepspeed-helloworld
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
+            name: deepspeed-helloworld
+            command:
+            - deepspeed
+            args:
+            - /workspace/DeepSpeedExamples/HelloDeepSpeed/train_bert_ds.py
+            - --checkpoint_dir
+            - /workspace
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          containers:
+          - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
+            name: deepspeed-helloworld
+            resources:
+              limits:
+                nvidia.com/gpu: 8