Skip to content

Commit

Permalink
Enable worker <-> master collective communication
Browse files Browse the repository at this point in the history
  • Loading branch information
mselim00 committed Jan 25, 2025
1 parent b4d5fbc commit caa3b1d
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 14 deletions.
14 changes: 5 additions & 9 deletions test/cases/neuron-training/manifests/neuron-training.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ spec:
completionMode: Indexed
completions: {{.NodeCount}}
parallelism: {{.NodeCount}}
backoffLimit: 0
backoffLimit: 3
template:
metadata:
annotations:
Expand All @@ -21,18 +21,14 @@ spec:
env:
- name: MASTER_ADDR
value: neuron-training-0.training
- name: MASTER_PORT
value: "{{.MasterPort}}"
args:
- sh
- -c
- |
- |
# Make EFA fork-safe (legacy AL2 requirement); see https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-runtime/nrt-troubleshoot.html#fi-efa-fork-safe
export FI_EFA_FORK_SAFE=1
torchrun --nproc_per_node {{.NeuronCorePerNode}} --nnodes {{.NodeCount}} --node_rank $JOB_COMPLETION_INDEX --master_addr $MASTER_ADDR --master_port $MASTER_PORT train.py
ports:
- name: master
containerPort: {{.MasterPort}}
export FI_EFA_FORK_SAFE=1
export CCOM_SOCKET_IFNAME=eth0
torchrun --nproc_per_node {{.NeuronCorePerNode}} --nnodes {{.NodeCount}} --node_rank $JOB_COMPLETION_INDEX --master_addr $MASTER_ADDR train.py
volumeMounts:
- name: dshm
mountPath: /dev/shm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,5 @@ metadata:
app: training
spec:
clusterIP: None
ports:
- name: master # Port for torchrun master-worker node communication.
port: {{.MasterPort}}
targetPort: {{.MasterPort}}
selector:
job-name: neuron-training # Selector for pods associated with this service.
job-name: neuron-training

0 comments on commit caa3b1d

Please sign in to comment.