From caa3b1d928cbdb98a3c6304d579ab3778c072398 Mon Sep 17 00:00:00 2001 From: Mazen Selim Date: Sat, 25 Jan 2025 05:46:54 +0000 Subject: [PATCH] Enable worker <-> master collective communication --- .../neuron-training/manifests/neuron-training.yaml | 14 +++++--------- .../manifests/training-comm-service.yaml | 6 +----- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/test/cases/neuron-training/manifests/neuron-training.yaml b/test/cases/neuron-training/manifests/neuron-training.yaml index effc7f43f..032f0931b 100644 --- a/test/cases/neuron-training/manifests/neuron-training.yaml +++ b/test/cases/neuron-training/manifests/neuron-training.yaml @@ -8,7 +8,7 @@ spec: completionMode: Indexed completions: {{.NodeCount}} parallelism: {{.NodeCount}} - backoffLimit: 0 + backoffLimit: 3 template: metadata: annotations: @@ -21,18 +21,14 @@ spec: env: - name: MASTER_ADDR value: neuron-training-0.training - - name: MASTER_PORT - value: "{{.MasterPort}}" args: - sh - -c - - | + - | # Enable EFA https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-runtime/nrt-troubleshoot.html#fi-efa-fork-safe (AL2 legacy requirement) - export FI_EFA_FORK_SAFE=1 - torchrun --nproc_per_node {{.NeuronCorePerNode}} --nnodes {{.NodeCount}} --node_rank $JOB_COMPLETION_INDEX --master_addr $MASTER_ADDR --master_port $MASTER_PORT train.py - ports: - - name: master - containerPort: {{.MasterPort}} + export FI_EFA_FORK_SAFE=1 + export CCOM_SOCKET_IFNAME=eth0 + torchrun --nproc_per_node {{.NeuronCorePerNode}} --nnodes {{.NodeCount}} --node_rank $JOB_COMPLETION_INDEX --master_addr $MASTER_ADDR train.py volumeMounts: - name: dshm mountPath: /dev/shm diff --git a/test/cases/neuron-training/manifests/training-comm-service.yaml b/test/cases/neuron-training/manifests/training-comm-service.yaml index e04cd8691..b0026f5f0 100644 --- a/test/cases/neuron-training/manifests/training-comm-service.yaml +++ b/test/cases/neuron-training/manifests/training-comm-service.yaml @@ -6,9 +6,5 @@ metadata: app: training spec: clusterIP: None - ports: - - name: master # Port for torchrun master-worker node communication. - port: {{.MasterPort}} - targetPort: {{.MasterPort}} selector: - job-name: neuron-training # Selector for pods associated with this service. \ No newline at end of file + job-name: neuron-training \ No newline at end of file