-
Notifications
You must be signed in to change notification settings - Fork 96
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #176 from johnbensnyder/mixtral
added mixtral and hybrid shard support
- Loading branch information
Showing
6 changed files
with
206 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
91 changes: 91 additions & 0 deletions
91
3.test_cases/10.FSDP/2.distributed-training-mistral.sbatch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#!/bin/bash | ||
|
||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: MIT-0 | ||
|
||
#SBATCH --nodes=4 # number of nodes to use | ||
#SBATCH --job-name=FSDP # name of your job | ||
#SBATCH --exclusive # job has exclusive use of the resource, no sharing | ||
|
||
# -e: stop the script as soon as a command exits non-zero; -x: print each command before it runs | ||
set -ex; | ||
|
||
########################### | ||
###### User Variables ##### | ||
########################### | ||
|
||
# Passed to torchrun as --nproc_per_node in TORCHRUN_ARGS further down | ||
GPUS_PER_NODE=8 # 4 for G5.12x, 8 for P4/P5 | ||
|
||
########################### | ||
## Environment Variables ## | ||
########################### | ||
|
||
## Plenty of EFA-level variables | ||
## Comment out for non-EFA instances (G4dn, P3) | ||
## For G5.12x, comment out RDMA and fork safe | ||
## For G4dn and other G5, comment out all | ||
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d | ||
export FI_EFA_FORK_SAFE=1 | ||
export FI_LOG_LEVEL=1 | ||
export FI_PROVIDER=efa | ||
## NCCL_DEBUG=INFO requests informational log output from NCCL | ||
export NCCL_DEBUG=INFO | ||
|
||
########################### | ||
####### Torch Dist ####### | ||
########################### | ||
|
||
# Launcher arguments for torchrun, kept in an array so they expand as separate words. | ||
#   --nproc_per_node : GPUS_PER_NODE set in the user-variables section above | ||
#   --nnodes         : SLURM_JOB_NUM_NODES from the Slurm job environment | ||
#   --rdzv_id        : SLURM_JOB_ID from the Slurm job environment | ||
#   --rdzv_backend   : c10d | ||
#   --rdzv_endpoint  : output of `hostname` on the machine executing this script | ||
declare -a TORCHRUN_ARGS=( | ||
--nproc_per_node=$GPUS_PER_NODE \ | ||
--nnodes=$SLURM_JOB_NUM_NODES \ | ||
--rdzv_id=$SLURM_JOB_ID \ | ||
--rdzv_backend=c10d \ | ||
--rdzv_endpoint=$(hostname) \ | ||
) | ||
|
||
# Relative paths to the torchrun launcher and the training entry point; | ||
# both are resolved from the job's working directory and used by the srun line at the end. | ||
export TORCHRUN=./pt_fsdp/bin/torchrun | ||
export TRAIN_SCRIPT=./train.py | ||
|
||
############################ | ||
# Mixtral Training Params ## | ||
############################ | ||
|
||
# Command-line flags handed to $TRAIN_SCRIPT by the srun line at the end of this file. | ||
# They cover batch sizes, step/epoch limits, seed and bf16, optimizer settings | ||
# (grad_clip, weight_decay, beta1/beta2), model shape with model_type "mixtral", | ||
# dropout and initializer values, rotary-embedding settings, the learning-rate | ||
# schedule, dataset ("c4", config "en") and tokenizer selection, plus | ||
# limit_all_gathers and sharding_strategy "hybrid". | ||
# NOTE(review): the exact meaning and accepted values of each flag are defined by | ||
# the training script, which is not visible here - confirm against train.py. | ||
declare -a TRAINING_ARGS=( | ||
--train_batch_size=4 \ | ||
--val_batch_size=4 \ | ||
--max_steps=5000 \ | ||
--seed=42 \ | ||
--bf16=1 \ | ||
--grad_clip=1.0 \ | ||
--weight_decay=0.2 \ | ||
--beta1=0.9 \ | ||
--beta2=0.95 \ | ||
--activation_checkpointing=1 \ | ||
--intermediate_size=14336 \ | ||
--num_key_value_heads=8 \ | ||
--logging_freq=1 \ | ||
--max_context_width=32768 \ | ||
--vocab_size=32000 \ | ||
--hidden_width=4096 \ | ||
--num_layers=32 \ | ||
--num_heads=32 \ | ||
--resid_pdrop=0.1 \ | ||
--embd_pdrop=0.1 \ | ||
--attn_pdrop=0.1 \ | ||
--summary_first_pdrop=0.1 \ | ||
--initializer_range=0.02 \ | ||
--model_type="mixtral" \ | ||
--rotary_pct=0.25 \ | ||
--rotary_emb_base=10000 \ | ||
--lr=0.0001 \ | ||
--lr_decay_style="cosine" \ | ||
--min_lr=1e-5 \ | ||
--warmup=0.0032 \ | ||
--plateau=0.0 \ | ||
--dataset="c4" \ | ||
--tokenizer="mistralai/Mixtral-8x7B-v0.1" \ | ||
--epochs=3 \ | ||
--dataset_config_name="en" \ | ||
--limit_all_gathers=1 \ | ||
--sharding_strategy="hybrid" | ||
) | ||
|
||
# Start the job through srun: run $TORCHRUN with TORCHRUN_ARGS, then $TRAIN_SCRIPT with TRAINING_ARGS. | ||
# srun's -l option labels each output line with the task number it came from. | ||
srun -l ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.