Skip to content

Commit

Permalink
Added distribute-training-bert-example and upgraded to vk 1.11
Browse files Browse the repository at this point in the history
  • Loading branch information
malvag committed Jul 29, 2024
1 parent e68f125 commit 45cd05b
Show file tree
Hide file tree
Showing 20 changed files with 2,014 additions and 514 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ hpk-pause:
GOOS=linux GOARCH=amd64 CGO_ENABLED=1 go build $(VERSION_FLAGS) -race -ldflags '-extldflags "-static"' -o bin/hpk-pause ./cmd/pause

docker-pause:
DOCKER_BUILDKIT=1 docker build . -t malvag/pause:apptainer -f deploy/images/pause-apptainer-agent/pause.apptainer.Dockerfile
DOCKER_BUILDKIT=1 docker build . -t malvag/pause:1.1.9 -f deploy/images/pause-apptainer-agent/pause.apptainer.Dockerfile
sudo docker push malvag/pause:1.1.9

##@ Deployment

Expand Down
2 changes: 2 additions & 0 deletions cmd/hpk/commands/root/admission.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ func AddAdmissionWebhooks(c Opts, virtualk8s *provider.VirtualK8S) {
RunInContainer: virtualk8s.RunInContainer,
GetContainerLogs: virtualk8s.GetContainerLogs,
GetPods: virtualk8s.GetPods,
PortForward: virtualk8s.PortForward,
// AttachToContainer: ,
// GetPodsFromKubernetes: func(context.Context) ([]*corev1.Pod, error) {
// return k8sclientset.CoreV1().Pods(c.KubeNamespace).List(ctx, labels.Everything())
// },
Expand Down
2 changes: 1 addition & 1 deletion compute/image/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ func (t Transport) Wrap(imageName string) string {
}

// const PauseImage = "icsforth/pause:apptainer"
const PauseImage = "malvag/pause:apptainer"
const PauseImage = "malvag/pause:1.1.9"
1 change: 1 addition & 0 deletions compute/podhandler/podhandler.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ func CreatePod(ctx context.Context, pod *corev1.Pod, watcher filenotify.FileWatc
jobID, err := slurm.SubmitJob(scriptFilePath)
if err != nil {
compute.SystemPanic(err, "failed to submit job")
//[TODO:] update pod status with insufficient resources
}

logger.Info(" * Slurm job has been submitted", "jobID", jobID)
Expand Down
1 change: 1 addition & 0 deletions compute/podhandler/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ exec {{$.HostEnv.ApptainerBin}} exec --containall --net --fakeroot --scratch /sc
{{- end}}
--env PARENT=${PPID} \
--bind $HOME/.k8sfs/kubernetes:/k8s-data \
--bind /etc/apptainer/apptainer.conf \
--bind $HOME,/tmp \
--hostname {{.Pod.Name}} \
{{$.PauseImageFilePath}} /usr/local/bin/hpk-pause -namespace {{.Pod.Namespace}} -pod {{.Pod.Name}} ||
Expand Down
21 changes: 21 additions & 0 deletions examples/apps/bert-distr-training/global_notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## Notes

1. `minikube start --memory='4000' --cpus='4' --disk-size='50000mb' --driver=kvm2 --nodes 3`
2. Deploy NFS.
3. Install MinIO:
   1. `kubectl create ns minio`
   2. `kubectl apply -f storage.yaml`
   3. `./install.sh`
4. Install the Kubeflow training operator: `kubectl apply --wait=true -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0"`
5. Install JupyterHub:
   1. `kubectl create ns kubeflow`
   2. `kubectl apply -f storage.yaml`
   3. `helm upgrade --cleanup-on-fail --install my-jupyter jupyterhub/jupyterhub --namespace kubeflow --create-namespace --values values.yaml`
6. Replace the MinIO access key and secret key in the notebook.
7. Run the notebook.


## Scratch

tChAGp5qbzy7SP80HruR
CYGcLfX9tgD6r0NQoI78VuHgr39sEehyiby0jy8w
45 changes: 45 additions & 0 deletions examples/apps/bert-distr-training/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#
# Installs the BERT distributed-training example stack:
#   1. MinIO, via the helper script in ./minio
#   2. the Kubeflow training operator (standalone overlay, ref v1.7.0)
#   3. JupyterHub, into the "kubeflow" namespace, using ./jhub/values.yaml
# and then makes sure a MinIO client ("mc") is available locally and uses it
# to create the "kubeflow-examples" bucket.
#
# All paths are relative, so run this script from the directory that contains
# it (the one holding the "minio" and "jhub" sub-directories).

# --- MinIO -------------------------------------------------------------------
# Stop if the directory is missing: otherwise "./install.sh" below would
# resolve to this very script and call itself recursively.
pushd minio || exit 1
kubectl create ns minio
kubectl apply -f hostpath-storage.yaml
./install.sh
popd || exit 1

# --- Kubeflow training operator ----------------------------------------------
kubectl apply --wait=true -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0"

# --- JupyterHub --------------------------------------------------------------
pushd jhub || exit 1
kubectl create ns kubeflow
kubectl apply -f hostpath-storage.yaml
# Register the chart repository first (mirrors what minio/install.sh does for
# its own chart); "jupyterhub/jupyterhub" cannot be resolved without it.
helm repo add jupyterhub https://hub.jupyter.org/helm-chart/
helm repo update
helm upgrade --cleanup-on-fail --install my-jupyter jupyterhub/jupyterhub --namespace kubeflow --create-namespace --values values.yaml
popd || exit 1

# --- MinIO credentials -------------------------------------------------------
# Read the root user/password from the "myminio" secret. They are printed on
# purpose so they can be copied into the notebook; treat the output as secret.
MC_ACCESS_KEY=$(kubectl get secret myminio -n minio -o jsonpath="{.data.rootUser}" | base64 --decode)
MC_SECRET_KEY=$(kubectl get secret myminio -n minio -o jsonpath="{.data.rootPassword}" | base64 --decode)
echo "MinIO credentials: $MC_ACCESS_KEY $MC_SECRET_KEY"

if [[ -z "$MC_ACCESS_KEY" || -z "$MC_SECRET_KEY" ]]; then
  echo "Error: Could not read the MinIO credentials from secret 'myminio'"
  exit 1
fi

# --- MinIO client ------------------------------------------------------------
if [ -f "./mc" ]; then
  echo "Minio client exists."
else
  echo "Minio client doesnt exist, downloading...."
  if ! wget https://dl.min.io/client/mc/release/linux-amd64/mc; then
    echo "Error: Could not download the MinIO client"
    # Remove any partial download so the next run does not mistake it for a
    # usable client.
    rm -f mc
    exit 1
  fi
  chmod +x mc
fi

MINIO_SERVICE_NAME="myminio"

# Look up the IP of the first address listed in the service's Endpoints object
# (this is a backend address, not the service's ClusterIP).
ENDPOINT=$(kubectl get endpoints "$MINIO_SERVICE_NAME" -n minio -o jsonpath='{.subsets[0].addresses[0].ip}')

# Check if the endpoint was retrieved successfully
if [[ -z "$ENDPOINT" ]]; then
  echo "Error: Could not get the first endpoint for MinIO service"
  exit 1
fi

# Quote every expansion so credentials containing spaces or glob characters
# are passed to mc as single arguments.
./mc alias set local "http://$ENDPOINT:9000" "$MC_ACCESS_KEY" "$MC_SECRET_KEY"
# --ignore-existing keeps re-runs from failing when the bucket is already there.
./mc mb --ignore-existing local/kubeflow-examples
# Alternatively, the "kubeflow-examples" bucket can be created through the
# MinIO console.
24 changes: 24 additions & 0 deletions examples/apps/bert-distr-training/jhub/hostpath-storage.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Statically provisioned storage for the JupyterHub part of the example.
#
# NOTE: despite the "nfs-share" names, the volume is a hostPath directory.
# The path below is specific to the original author's machine; change it to a
# directory that exists on your node before applying.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: nfs-share-pv
spec:
  capacity:
    storage: 40Gi
  accessModes:
    - ReadWriteMany
  hostPath:
    path: /home/malvag/kubeflow_testbed/jhub_data
---
# Claim in the "kubeflow" namespace requesting the full 40Gi of the volume
# defined above.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: nfs-share-pvc
  namespace: kubeflow
spec:
  # Empty storage class: bind to a pre-created PersistentVolume instead of
  # asking a provisioner for a new one.
  storageClassName: ""
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 40Gi
34 changes: 34 additions & 0 deletions examples/apps/bert-distr-training/jhub/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Helm values for the JupyterHub chart used by the BERT distributed-training
# example (installed as release "my-jupyter" in the "kubeflow" namespace).
# NOTE(review): the nesting below follows the JupyterHub chart's layout
# (hub / prePuller / proxy / singleuser) — confirm against the chart version
# in use.
hub:
  config:
    JupyterHub:
      hub_connect_url: "http://hub:8081"
    ConfigurableHTTPProxy:
      api_url: "http://proxy-api:8001"
  db:
    type: "sqlite-memory"
  networkPolicy:
    enabled: false
prePuller:
  hook:
    enabled: false
proxy:
  service:
    type: ClusterIP
  https:
    enabled: true
  chp:
    networkPolicy:
      enabled: false
singleuser:
  # Service account the user notebook pods run as.
  serviceAccountName: training-operator
  storage:
    dynamic:
      storageClass: openebs-hostpath
    capacity: 30Gi
    # Alternative: use a pre-created claim instead of dynamic provisioning.
    # type: "static"
    # capacity: "10Gi"
    # static:
    #   pvcName: "my-storage-pvc"
    #   subPath: ""
  networkPolicy:
    enabled: false
24 changes: 24 additions & 0 deletions examples/apps/bert-distr-training/minio/hostpath-storage.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Statically provisioned storage for the MinIO part of the example.
#
# NOTE: despite the "nfs-share" names, the volume is a hostPath directory.
# The path below is specific to the original author's machine; change it to a
# directory that exists on your node before applying.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: nfs-share-pv-minio
spec:
  capacity:
    storage: 40Gi
  accessModes:
    - ReadWriteMany
  hostPath:
    path: /home/malvag/kubeflow_testbed/minio_data
---
# Claim in the "minio" namespace; it requests 20Gi, i.e. less than the 40Gi
# capacity of the volume defined above.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: nfs-share-pvc-minio
  namespace: minio
spec:
  # Empty storage class: bind to a pre-created PersistentVolume instead of
  # asking a provisioner for a new one.
  storageClassName: ""
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 20Gi
18 changes: 18 additions & 0 deletions examples/apps/bert-distr-training/minio/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Installs a standalone, single-replica MinIO (Helm release "my-minio",
# resources named "myminio") into the "minio" namespace and prints the
# generated root credentials. The namespace itself is expected to exist
# already; this script does not create it.

helm repo add minio https://charts.min.io/
helm repo update

TEST_NAMESPACE=minio

# Install Minio
# "upgrade --install" (as used for the JupyterHub release) installs on the
# first run and upgrades afterwards, so re-running the script does not fail
# because the release name is already taken.
helm upgrade --install --debug --wait \
  my-minio minio/minio \
  --namespace "${TEST_NAMESPACE}" \
  --set resources.requests.memory=512Mi \
  --set replicas=1 \
  --set persistence.enabled=false \
  --set mode=standalone \
  --set fullnameOverride=myminio

# Extract Minio Credentials
# They are printed on purpose so they can be reused by the caller; treat the
# output as secret.
ACCESS_KEY=$(kubectl get secret myminio -n "${TEST_NAMESPACE}" -o jsonpath="{.data.rootUser}" | base64 --decode)
SECRET_KEY=$(kubectl get secret myminio -n "${TEST_NAMESPACE}" -o jsonpath="{.data.rootPassword}" | base64 --decode)
echo "MinIO credentials: $ACCESS_KEY $SECRET_KEY"
Loading

0 comments on commit 45cd05b

Please sign in to comment.