From b9141c05407f55db053918fb6069b3b24ef6c4c1 Mon Sep 17 00:00:00 2001 From: Aldo Culquicondor <1299064+alculquicondor@users.noreply.github.com> Date: Wed, 1 Sep 2021 11:04:45 -0400 Subject: [PATCH] Preparing release of v0.3.0 (#414) Also - Updated Makefile to use new version - extra notes for developers --- CONTRIBUTING.md | 70 +++++++++++++++++---- Makefile | 6 +- README.md | 47 ++++++++++---- RELEASE.md | 15 +++++ examples/horovod/tensorflow-mnist.yaml | 5 +- examples/pi/README.md | 11 +++- examples/pi/pi-intel.yaml | 4 +- examples/pi/pi.yaml | 4 +- examples/v2beta1/tensorflow-benchmarks.yaml | 52 +++++++++++++++ 9 files changed, 180 insertions(+), 34 deletions(-) create mode 100644 examples/v2beta1/tensorflow-benchmarks.yaml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 419de41e6..7b9051f7b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,34 +14,78 @@ git clone https://github.com/${GITHUB_USER}/mpi-operator.git ## Install Dependencies -We use Go v1.13+ for development and use [Go Modules](https://blog.golang.org/using-go-modules) to download and install the dependencies. +We use Go v1.15+ for development and use [Go Modules](https://blog.golang.org/using-go-modules) to download and install the dependencies. -## Run Unit Test +## Controller versions -You can execute all the unit tests via `go test ./...`. +The main module `github.com/kubeflow/mpi-operator` contains the code of the legacy +controllers `v1alpha1`, `v1alpha2` and `v1`. + +The newest iteration of the controller is in the module `github.com/kubeflow/mpi-operator/v2`. + +## Run tests + +### Unit and integration tests + +You can execute all the unit and integration tests via `make test`. + +If you only wish to run the tests for the v2 controller, you can run `make test_v2`. + +You can find the unit tests in the same folders as the functional code. + +You can find the integration tests in a separate directory, `v2/test/integration`. 
+Integration tests make use of a real kube-apiserver to test the interaction of +the controller with a real Kubernetes API. In these tests, other components +are not running, including `kubelet` or `kube-controller-manager`. + +Consider adding an integration test if your feature makes new API calls. + +### E2E tests + +E2E tests run against a real cluster. In our tests, we create a cluster using +[kind](https://kind.sigs.k8s.io/docs/user/quick-start/). + +You can run the tests with `make test_e2e`. + +If desired, you can run the tests against any existing cluster. Just make sure +that credentials for the cluster are present in `${HOME}/.kube/config` and run: + +```bash +USE_EXISTING_CLUSTER=true make test_e2e +``` ## Check Code Style -We use [golangci-lint](https://github.com/golangci/golangci-lint) to check issues on code style. Please also check out [this wiki](https://github.com/golang/go/wiki/CodeReviewComments) for some additional instructions on code review. +We use [golangci-lint](https://github.com/golangci/golangci-lint) to check issues on code style. +Please also check out [this wiki](https://github.com/golang/go/wiki/CodeReviewComments) for some additional instructions on code review. + +You can run formatter and linter with: + +```bash +make fmt lint +``` ## Run You have to build the image and deploy the standalone YAMLs in a cluster. 
-```shell -CONTROLLER_VERSION=v1 RELEASE_VERSION=latest make images -kubectl apply -k manifests/overlays/standalone +```bash +make images dev_manifest +kubectl apply -k manifests/overlays/dev ``` -If you need to use a different registry, you can do: +You can build an image using a legacy controller by setting `CONTROLLER_VERSION`: -```shell -IMAGE_NAME=example.com/mpi-operator CONTROLLER_VERSION=v1 RELEASE_VERSION=latest make images +```bash +make CONTROLLER_VERSION=v1 images dev_manifest +kubectl apply -k manifests/overlays/dev ``` -Next, modify the line `newName` in `manifests/overlays/standalone/kustomization.yaml` -to match the image name. After pushing the image to the registry, you can apply -the YAMLs the same way as before. +If you need to use a different registry, or a different tag, you can do: + +```bash +make IMAGE_NAME=example.com/mpi-operator CONTROLLER_VERSION=v1 RELEASE_VERSION=dev images dev_manifest +``` To look at the controller's logs, you can do: diff --git a/Makefile b/Makefile index c54f9d501..24e9a71d6 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ REPO_PATH="github.com/kubeflow/mpi-operator" REL_OSARCH="linux/amd64" GitSHA=`git rev-parse HEAD` Date=`date "+%Y-%m-%d %H:%M:%S"` -RELEASE_VERSION?=v0.2.2 -CONTROLLER_VERSION?=v1alpha2 +RELEASE_VERSION?=v0.3.0 +CONTROLLER_VERSION?=v2 IMG_BUILDER=docker LD_FLAGS=" \ -X '${REPO_PATH}/pkg/version.GitSHA=${GitSHA}' \ @@ -14,7 +14,7 @@ LD_FLAGS_V2=" \ -X '${REPO_PATH}/v2/pkg/version.GitSHA=${GitSHA}' \ -X '${REPO_PATH}/v2/pkg/version.Built=${Date}' \ -X '${REPO_PATH}/v2/pkg/version.Version=${RELEASE_VERSION}'" -IMAGE_NAME?=kubeflow/mpi-operator +IMAGE_NAME?=mpioperator/mpi-operator KUBEBUILDER_ASSETS_PATH := $(dir $(abspath $(firstword $(MAKEFILE_LIST))))bin/kubebuilder/bin KIND_VERSION=v0.11.1 # This kubectl version supports -k for kustomization. 
diff --git a/README.md b/README.md index 2809f0f2b..b325e6e54 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ You can deploy the operator with default settings by running the following comma ```shell git clone https://github.com/kubeflow/mpi-operator cd mpi-operator -kubectl create -f deploy/v1alpha2/mpi-operator.yaml +kubectl apply -f deploy/v2beta1/mpi-operator.yaml ``` Alternatively, follow the [getting started guide](https://www.kubeflow.org/docs/started/getting-started/) to deploy Kubeflow. @@ -34,16 +34,22 @@ mpijobs.kubeflow.org 4d ... ``` -If it is not included you can add it as follows using [kustomize](https://github.com/kubernetes-sigs/kustomize): +If it is not included, you can add it as follows using [kustomize](https://github.com/kubernetes-sigs/kustomize): ```bash git clone https://github.com/kubeflow/mpi-operator -cd mpi-operator/manifests -kustomize build overlays/kubeflow | kubectl apply -f - +cd mpi-operator +kustomize build manifests/overlays/kubeflow | kubectl apply -f - ``` Note that since Kubernetes v1.14, `kustomize` became a subcommand in `kubectl` so you can also run the following command instead: +Since Kubernetes v1.21, you can use: + +```bash +kubectl apply -k manifests/overlays/kubeflow +``` + ```bash kubectl kustomize base | kubectl apply -f - ``` @@ -53,13 +59,13 @@ kubectl kustomize base | kubectl apply -f - You can create an MPI job by defining an `MPIJob` config file. See [TensorFlow benchmark example](https://github.com/kubeflow/mpi-operator/blob/master/examples/v1alpha2/tensorflow-benchmarks.yaml) config file for launching a multi-node TensorFlow benchmark training job. You may change the config file based on your requirements. 
``` -cat examples/v1alpha2/tensorflow-benchmarks.yaml +cat examples/v2beta1/tensorflow-benchmarks.yaml ``` Deploy the `MPIJob` resource to start training: ``` -kubectl create -f examples/v1alpha2/tensorflow-benchmarks.yaml +kubectl apply -f examples/v2beta1/tensorflow-benchmarks.yaml ``` ## Monitoring an MPI Job @@ -71,7 +77,7 @@ kubectl get -o yaml mpijobs tensorflow-benchmarks ``` ``` -apiVersion: kubeflow.org/v1alpha2 +apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: creationTimestamp: "2019-07-09T22:15:51Z" @@ -82,7 +88,8 @@ metadata: selfLink: /apis/kubeflow.org/v1alpha2/namespaces/default/mpijobs/tensorflow-benchmarks uid: 1c5b470f-a297-11e9-964d-88d7f67c6e6d spec: - cleanPodPolicy: Running + runPolicy: + cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 @@ -199,6 +206,12 @@ Variables: horovod total images/sec: 308.27 ``` +For a sample that uses Intel MPI, see: + +```bash +cat examples/pi/pi-intel.yaml +``` + ## Exposed Metrics | Metric name | Metric type | Description | Labels | @@ -213,9 +226,21 @@ total images/sec: 308.27 With [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics), one can join metrics by labels. For example `kube_pod_info * on(pod,namespace) group_left label_replace(mpi_operator_job_infos, "pod", "$0", "launcher", ".*")` -# Docker Images +## Docker Images -Docker images are built and pushed automatically to [mpioperator on Dockerhub](https://hub.docker.com/u/mpioperator). You can use the following Dockerfiles to build the images yourself: +We push Docker images of [mpioperator on Dockerhub](https://hub.docker.com/u/mpioperator) for every release. 
+You can use the following Dockerfile to build the image yourself: - [mpi-operator](https://github.com/kubeflow/mpi-operator/blob/master/Dockerfile) -- [kubectl-delivery](https://github.com/kubeflow/mpi-operator/blob/master/cmd/kubectl-delivery/Dockerfile) + +Alternatively, you can build the image using make: + +```bash +make RELEASE_VERSION=dev images +``` + +This will produce an image with the tag `mpioperator/mpi-operator:dev`. + +## Contributing + +Learn more in [CONTRIBUTING](https://github.com/kubeflow/mpi-operator/blob/master/CONTRIBUTING.md). \ No newline at end of file diff --git a/RELEASE.md b/RELEASE.md index c2af4e8dd..0136094a5 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,20 @@ # MPI Operator Releases +## Release v0.3.0 + +* Scalability improvements: + * Worker start up no longer issues requests to kube-apiserver. + * Dropped kubectl-delivery init container, reducing stress on kube-apiserver. +* Support for Intel MPI. +* Support for `runPolicy` (`ttlSecondsAfterFinished`, `activeDeadlineSeconds`, `backoffLimit`) + by using a k8s Job for the launcher. +* Samples for plain MPI applications. +* Production readiness improvements: + * Increased coverage throughout unit, integration and E2E tests. + * More robust API validation. + * Revisited v2beta1 MPIJob API. + * Using fully-qualified label names, consistent with other kubeflow operators. 
+ ## Release v0.2.3 ### Enhancements diff --git a/examples/horovod/tensorflow-mnist.yaml b/examples/horovod/tensorflow-mnist.yaml index 49da7fd96..7fd67dcb3 100644 --- a/examples/horovod/tensorflow-mnist.yaml +++ b/examples/horovod/tensorflow-mnist.yaml @@ -1,10 +1,11 @@ -apiVersion: kubeflow.org/v1 +apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: name: tensorflow-mnist spec: slotsPerWorker: 1 - cleanPodPolicy: Running + runPolicy: + cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 diff --git a/examples/pi/README.md b/examples/pi/README.md index ad36a656d..f3e3aabbc 100644 --- a/examples/pi/README.md +++ b/examples/pi/README.md @@ -7,13 +7,22 @@ Then, it calculates an approximate value for pi. ## How to build Image +For OpenMPI: + ```bash docker build -t mpi-pi . ``` +For Intel MPI: + +```bash +docker build -t mpi-pi . -f intel.Dockerfile +``` + ## Create MPIJob -Modify `pi.yaml` to set up the image name from your own registry. +Modify `pi.yaml` (for OpenMPI) or `pi-intel.yaml` (for Intel MPI) to set up the +image name from your own registry. 
Then, run: diff --git a/examples/pi/pi-intel.yaml b/examples/pi/pi-intel.yaml index 580e37fa2..5efacddc4 100644 --- a/examples/pi/pi-intel.yaml +++ b/examples/pi/pi-intel.yaml @@ -14,7 +14,7 @@ spec: template: spec: containers: - - image: kubeflow/mpi-pi:intel + - image: mpioperator/mpi-pi:intel imagePullPolicy: Always name: mpi-launcher securityContext: @@ -33,7 +33,7 @@ spec: template: spec: containers: - - image: kubeflow/mpi-pi:intel + - image: mpioperator/mpi-pi:intel imagePullPolicy: Always name: mpi-worker securityContext: diff --git a/examples/pi/pi.yaml b/examples/pi/pi.yaml index 57e3ec344..aaeb97c42 100644 --- a/examples/pi/pi.yaml +++ b/examples/pi/pi.yaml @@ -14,7 +14,7 @@ spec: template: spec: containers: - - image: kubeflow/mpi-pi + - image: mpioperator/mpi-pi name: mpi-launcher securityContext: runAsUser: 1000 @@ -33,7 +33,7 @@ spec: template: spec: containers: - - image: kubeflow/mpi-pi + - image: mpioperator/mpi-pi name: mpi-worker securityContext: runAsUser: 1000 diff --git a/examples/v2beta1/tensorflow-benchmarks.yaml b/examples/v2beta1/tensorflow-benchmarks.yaml new file mode 100644 index 000000000..c5eff6460 --- /dev/null +++ b/examples/v2beta1/tensorflow-benchmarks.yaml @@ -0,0 +1,52 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: tensorflow-benchmarks +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: mpioperator/tensorflow-benchmarks:latest + name: tensorflow-benchmarks + command: + - mpirun + - --allow-run-as-root + - -np + - "2" + - -bind-to + - none + - -map-by + - slot + - -x + - NCCL_DEBUG=INFO + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -mca + - pml + - ob1 + - -mca + - btl + - ^openib + - python + - scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py + - --model=resnet101 + - --batch_size=64 + - --variable_update=horovod + Worker: + replicas: 2 + template: + spec: + containers: + - image: 
mpioperator/tensorflow-benchmarks:latest + name: tensorflow-benchmarks + resources: + limits: + nvidia.com/gpu: 1