diff --git a/Dockerfile b/Dockerfile index dd8b88c1f..3a1f637f7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ -FROM golang:1.12.6-alpine3.10 AS build +FROM golang:1.12.7-alpine3.10 AS build WORKDIR /go/src/github.com/kubeflow/mpi-operator/ COPY . /go/src/github.com/kubeflow/mpi-operator/ -RUN go build -o /bin/mpi-operator github.com/kubeflow/mpi-operator/cmd/mpi-operator.v1alpha1 +RUN go build -o /bin/mpi-operator github.com/kubeflow/mpi-operator/cmd/mpi-operator.v1alpha2 FROM alpine:3.10 COPY --from=build /bin/mpi-operator /bin/mpi-operator diff --git a/README.md b/README.md index ac1165912..b2327ac60 100644 --- a/README.md +++ b/README.md @@ -39,20 +39,21 @@ ks apply ${ENVIRONMENT} -c mpi-operator Alternatively, you can deploy the operator with default settings without using ksonnet by running the following from the repo: ```shell +kubectl create -f deploy/crd/crd-v1alpha2.yaml kubectl create -f deploy/ ``` ## Creating an MPI Job -You can create an MPI job by defining an `MPIJob` config file. See [Tensorflow benchmark example](https://github.com/kubeflow/mpi-operator/blob/master/examples/tensorflow-benchmarks.yaml) config file for launching a multi-node TensorFlow benchmark training job. You may change the config file based on your requirements. +You can create an MPI job by defining an `MPIJob` config file. See [Tensorflow benchmark example](https://github.com/kubeflow/mpi-operator/blob/master/examples/v1alpha2/tensorflow-benchmarks.yaml) config file for launching a multi-node TensorFlow benchmark training job. You may change the config file based on your requirements. ``` -cat examples/tensorflow-benchmarks.yaml +cat examples/v1alpha2/tensorflow-benchmarks.yaml ``` Deploy the `MPIJob` resource to start training: ``` -kubectl create -f examples/tensorflow-benchmarks.yaml +kubectl create -f examples/v1alpha2/tensorflow-benchmarks.yaml ``` ## Monitoring an MPI Job @@ -60,45 +61,105 @@ kubectl create -f examples/tensorflow-benchmarks.yaml Once the `MPIJob` resource is created, you should now be able to see the created pods matching the specified number of GPUs. You can also monitor the job status from the status section. Here is sample output when the job is successfully completed. ``` -kubectl get -o yaml mpijobs tensorflow-benchmarks-16 +kubectl get -o yaml mpijobs tensorflow-benchmarks ``` ``` -apiVersion: kubeflow.org/v1alpha1 +apiVersion: kubeflow.org/v1alpha2 kind: MPIJob metadata: - clusterName: "" - creationTimestamp: 2019-01-07T20:32:12Z + creationTimestamp: "2019-07-09T22:15:51Z" generation: 1 - name: tensorflow-benchmarks-16 + name: tensorflow-benchmarks namespace: default - resourceVersion: "185051397" - selfLink: /apis/kubeflow.org/v1alpha1/namespaces/default/mpijobs/tensorflow-benchmarks-16 - uid: 8dc8c044-127d-11e9-a419-02420bbe29f3 + resourceVersion: "5645868" + selfLink: /apis/kubeflow.org/v1alpha2/namespaces/default/mpijobs/tensorflow-benchmarks + uid: 1c5b470f-a297-11e9-964d-88d7f67c6e6d spec: - gpus: 16 - template: - metadata: - creationTimestamp: null - spec: - containers: - - image: mpioperator/tensorflow-benchmarks:latest - name: tensorflow-benchmarks - resources: {} + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - command: + - mpirun + - --allow-run-as-root + - -np + - "2" + - -bind-to + - none + - -map-by + - slot + - -x + - NCCL_DEBUG=INFO + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -mca + - pml + - ob1 + - -mca + - btl + - ^openib + - python + - scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py + - --model=resnet101 + - --batch_size=64 + - --variable_update=horovod + image: mpioperator/tensorflow-benchmarks:latest + name: tensorflow-benchmarks + Worker: + replicas: 1 + template: + spec: + containers: + - image: mpioperator/tensorflow-benchmarks:latest + name: tensorflow-benchmarks + resources: + limits: + nvidia.com/gpu: 2 + slotsPerWorker: 2 status: - launcherStatus: Succeeded + completionTime: "2019-07-09T22:17:06Z" + conditions: + - lastTransitionTime: "2019-07-09T22:15:51Z" + lastUpdateTime: "2019-07-09T22:15:51Z" + message: MPIJob default/tensorflow-benchmarks is created. + reason: MPIJobCreated + status: "True" + type: Created + - lastTransitionTime: "2019-07-09T22:15:54Z" + lastUpdateTime: "2019-07-09T22:15:54Z" + message: MPIJob default/tensorflow-benchmarks is running. + reason: MPIJobRunning + status: "False" + type: Running + - lastTransitionTime: "2019-07-09T22:17:06Z" + lastUpdateTime: "2019-07-09T22:17:06Z" + message: MPIJob default/tensorflow-benchmarks successfully completed. + reason: MPIJobSucceeded + status: "True" + type: Succeeded + replicaStatuses: + Launcher: + succeeded: 1 + Worker: {} + startTime: "2019-07-09T22:15:51Z" ``` Training should run for 100 steps and takes a few minutes on a GPU cluster. You can inspect the logs to see the training progress. When the job starts, access the logs from the `launcher` pod: ``` -PODNAME=$(kubectl get pods -l mpi_job_name=tensorflow-benchmarks-16,mpi_role_type=launcher -o name) +PODNAME=$(kubectl get pods -l mpi_job_name=tensorflow-benchmarks,mpi_role_type=launcher -o name) kubectl logs -f ${PODNAME} ``` ``` -TensorFlow: 1.10 +TensorFlow: 1.14 Model: resnet101 Dataset: imagenet (synthetic) Mode: training @@ -108,32 +169,29 @@ Batch size: 128 global Num batches: 100 Num epochs: 0.01 Devices: ['horovod/gpu:0', 'horovod/gpu:1'] +NUMA bind: False Data format: NCHW Optimizer: sgd Variables: horovod ... -40 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.146 -40 images/sec: 132.1 +/- 0.0 (jitter = 0.1) 9.182 -50 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.071 -50 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.210 -60 images/sec: 132.2 +/- 0.0 (jitter = 0.2) 9.180 -60 images/sec: 132.2 +/- 0.0 (jitter = 0.2) 9.055 -70 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.005 -70 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.096 -80 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.231 -80 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.197 -90 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.201 -90 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.089 -100 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.183 ----------------------------------------------------------------- -total images/sec: 264.26 ----------------------------------------------------------------- -100 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.044 ----------------------------------------------------------------- -total images/sec: 264.26 +40 images/sec: 154.4 +/- 0.7 (jitter = 4.0) 8.280 +40 images/sec: 154.4 +/- 0.7 (jitter = 4.1) 8.482 +50 images/sec: 154.8 +/- 0.6 (jitter = 4.0) 8.397 +50 images/sec: 154.8 +/- 0.6 (jitter = 4.2) 8.450 +60 images/sec: 154.5 +/- 0.5 (jitter = 4.1) 8.321 +60 images/sec: 154.5 +/- 0.5 (jitter = 4.4) 8.349 +70 images/sec: 154.5 +/- 0.5 (jitter = 4.0) 8.433 +70 images/sec: 154.5 +/- 0.5 (jitter = 4.4) 8.430 +80 images/sec: 154.8 +/- 0.4 (jitter = 3.6) 8.199 +80 images/sec: 154.8 +/- 0.4 (jitter = 3.8) 8.404 +90 images/sec: 154.6 +/- 0.4 (jitter = 3.7) 8.418 +90 images/sec: 154.6 +/- 0.4 (jitter = 3.6) 8.459 +100 images/sec: 154.2 +/- 0.4 (jitter = 4.0) 8.372 +100 images/sec: 154.2 +/- 0.4 (jitter = 4.0) 8.542 ---------------------------------------------------------------- +total images/sec: 308.27 ``` # Docker Images diff --git a/cmd/kubectl-delivery/Dockerfile b/cmd/kubectl-delivery/Dockerfile index 1aca029c6..a7fd5e4d4 100644 --- a/cmd/kubectl-delivery/Dockerfile +++ b/cmd/kubectl-delivery/Dockerfile @@ -1,13 +1,13 @@ -FROM alpine:3.8 AS build +FROM alpine:3.10 AS build # Install kubectl. -ENV K8S_VERSION v1.13.2 +ENV K8S_VERSION v1.15.0 RUN apk add --no-cache wget RUN wget -q https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl RUN chmod +x ./kubectl RUN mv ./kubectl /bin/kubectl -FROM alpine:3.8 +FROM alpine:3.10 COPY --from=build /bin/kubectl /bin/kubectl COPY deliver_kubectl.sh . ENTRYPOINT ["./deliver_kubectl.sh"] diff --git a/cmd/mpi-operator.v1alpha2/app/options/options.go b/cmd/mpi-operator.v1alpha2/app/options/options.go index d86851ec7..b402c86e8 100644 --- a/cmd/mpi-operator.v1alpha2/app/options/options.go +++ b/cmd/mpi-operator.v1alpha2/app/options/options.go @@ -50,8 +50,8 @@ func (s *ServerOption) AddFlags(fs *flag.FlagSet) { "The container image used to deliver the kubectl binary.") fs.StringVar(&s.Namespace, "namespace", v1.NamespaceAll, - `The namespace to monitor tfjobs. If unset, it monitors all namespaces cluster-wide. - If set, it only monitors tfjobs in the given namespace.`) + `The namespace to monitor mpijobs. If unset, it monitors all namespaces cluster-wide. + If set, it only monitors mpijobs in the given namespace.`) fs.IntVar(&s.Threadiness, "threadiness", 2, `How many threads to process the main logic`) diff --git a/deploy/2-rbac.yaml b/deploy/2-rbac.yaml index 9fe9fd7b7..a8a470530 100644 --- a/deploy/2-rbac.yaml +++ b/deploy/2-rbac.yaml @@ -26,6 +26,14 @@ rules: - pods/exec verbs: - create +- apiGroups: + - "" + resources: + - endpoints + verbs: + - create + - get + - update - apiGroups: - "" resources: @@ -80,6 +88,7 @@ rules: - kubeflow.org resources: - mpijobs + - mpijobs/status verbs: - "*" --- diff --git a/deploy/3-mpi-operator.yaml b/deploy/3-mpi-operator.yaml index 26d78e6ab..741be0c9b 100644 --- a/deploy/3-mpi-operator.yaml +++ b/deploy/3-mpi-operator.yaml @@ -21,7 +21,6 @@ spec: image: mpioperator/mpi-operator:latest args: [ "-alsologtostderr", - "--gpus-per-node", "8", "--kubectl-delivery-image", "mpioperator/kubectl-delivery:latest" ] diff --git a/examples/tensorflow-benchmarks/Dockerfile b/examples/tensorflow-benchmarks/Dockerfile index 568b3e3ad..6f99d99f3 100644 --- a/examples/tensorflow-benchmarks/Dockerfile +++ b/examples/tensorflow-benchmarks/Dockerfile @@ -1,16 +1,8 @@ -FROM uber/horovod:0.15.2-tf1.12.0-torch1.0.0-py2.7 - -# Temporary fix until Horovod pushes out a new release. -# See https://github.com/uber/horovod/pull/700 -RUN sed -i '/^NCCL_SOCKET_IFNAME.*/d' /etc/nccl.conf +FROM horovod/horovod:0.16.4-tf1.14.0-torch1.1.0-mxnet1.4.1-py3.6 RUN mkdir /tensorflow WORKDIR "/tensorflow" -RUN git clone -b cnn_tf_v1.12_compatible https://github.com/tensorflow/benchmarks +RUN git clone https://github.com/tensorflow/benchmarks WORKDIR "/tensorflow/benchmarks" -CMD mpirun \ - python scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py \ - --model resnet101 \ - --batch_size 64 \ - --variable_update horovod +CMD ["/bin/bash"] diff --git a/examples/tensorflow-benchmarks-custom.yaml b/examples/v1alpha1/tensorflow-benchmarks-custom.yaml similarity index 100% rename from examples/tensorflow-benchmarks-custom.yaml rename to examples/v1alpha1/tensorflow-benchmarks-custom.yaml diff --git a/examples/tensorflow-benchmarks-imagenet.yaml b/examples/v1alpha1/tensorflow-benchmarks-imagenet.yaml similarity index 100% rename from examples/tensorflow-benchmarks-imagenet.yaml rename to examples/v1alpha1/tensorflow-benchmarks-imagenet.yaml diff --git a/examples/tensorflow-benchmarks.yaml b/examples/v1alpha1/tensorflow-benchmarks.yaml similarity index 100% rename from examples/tensorflow-benchmarks.yaml rename to examples/v1alpha1/tensorflow-benchmarks.yaml diff --git a/examples/v1alpha2/tensorflow-benchmarks-custom.yaml b/examples/v1alpha2/tensorflow-benchmarks-custom.yaml deleted file mode 100644 index 15895ba52..000000000 --- a/examples/v1alpha2/tensorflow-benchmarks-custom.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# This file shows how to run multi-node training benchmarks using an MPIJob, -# specifying GPUs explicitly per worker. -apiVersion: kubeflow.org/v1alpha2 -kind: MPIJob -metadata: - name: tensorflow-benchmarks-16-custom -spec: - slotsPerWorker: 4 - mpiReplicaSpecs: - Launcher: - replicas: 1 - template: - spec: - containers: - - image: mpioperator/tensorflow-benchmarks:latest - name: tensorflow-benchmarks - Worker: - replicas: 4 - template: - spec: - containers: - - image: mpioperator/tensorflow-benchmarks:latest - name: tensorflow-benchmarks - resources: - limits: - nvidia.com/gpu: 4 \ No newline at end of file diff --git a/examples/v1alpha2/tensorflow-benchmarks-imagenet.yaml b/examples/v1alpha2/tensorflow-benchmarks-imagenet.yaml index cac1637f3..412b6478c 100644 --- a/examples/v1alpha2/tensorflow-benchmarks-imagenet.yaml +++ b/examples/v1alpha2/tensorflow-benchmarks-imagenet.yaml @@ -8,6 +8,8 @@ kind: MPIJob metadata: name: tensorflow-benchmarks-imagenet spec: + slotsPerWorker: 8 + cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 @@ -18,6 +20,25 @@ spec: name: tensorflow-benchmarks command: - mpirun + - --allow-run-as-root + - -np + - "16" + - -bind-to + - none + - -map-by + - slot + - -x + - NCCL_DEBUG=INFO + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -mca + - pml + - ob1 + - -mca + - btl + - ^openib - python - scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py - --data_format=NCHW @@ -39,6 +60,9 @@ spec: containers: - image: mpioperator/tensorflow-benchmarks:latest name: tensorflow-benchmarks + resources: + limits: + nvidia.com/gpu: 8 volumeMounts: - mountPath: /efs name: efs diff --git a/examples/v1alpha2/tensorflow-benchmarks.yaml b/examples/v1alpha2/tensorflow-benchmarks.yaml index 7eb1bc3e4..a7f002a57 100644 --- a/examples/v1alpha2/tensorflow-benchmarks.yaml +++ b/examples/v1alpha2/tensorflow-benchmarks.yaml @@ -1,9 +1,10 @@ apiVersion: kubeflow.org/v1alpha2 kind: MPIJob metadata: - name: tensorflow-benchmarks-16 + name: tensorflow-benchmarks spec: slotsPerWorker: 1 + cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 @@ -12,6 +13,32 @@ spec: containers: - image: mpioperator/tensorflow-benchmarks:latest name: tensorflow-benchmarks + command: + - mpirun + - --allow-run-as-root + - -np + - "2" + - -bind-to + - none + - -map-by + - slot + - -x + - NCCL_DEBUG=INFO + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -mca + - pml + - ob1 + - -mca + - btl + - ^openib + - python + - scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py + - --model=resnet101 + - --batch_size=64 + - --variable_update=horovod Worker: replicas: 2 template: @@ -19,4 +46,6 @@ spec: containers: - image: mpioperator/tensorflow-benchmarks:latest name: tensorflow-benchmarks - + resources: + limits: + nvidia.com/gpu: 1