Skip to content

Commit

Permalink
update script
Browse files — browse the repository at this point in the history
  • Loading branch information
cli99 committed May 28, 2024
1 parent 772ea4e commit 93ad8f1
Showing 1 changed file with 6 additions and 5 deletions.
11 changes: 6 additions & 5 deletions examples/dbrx/run_train.sh
Original file line number Diff line number Diff line change
# Estimate DBRX training cost/memory with llm-analysis.
# NOTE(review): this span was reconstructed from a rendered diff. The stale
# pre-commit lines (old batch_size_per_gpu/num_gpus/flops_efficiency values
# and the old invocation that referenced the undefined ${output_file_suffix})
# were dropped in favor of the post-commit state, and the fused "@@" hunk
# headers were removed.
model_name=dbrx # model name (hf model name) or model config json file path
seq_len=4096
gpu_name='h100-sxm-80gb' # python -m llm_analysis.config list_gpu_configs
dtype_name="w16a16e16" # 16-bit weights, activations, and embedding
batch_size_per_gpu=2 # device_train_microbatch_size
num_gpus=3072 # num_gpus
activation_recomputation=0 # 0: no activation recomputation; 1: checkpoint attention compute; 2: checkpoint attention ; 3: checkpoint layernorm-attention-layernorm; 4: checkpoint attention the entire transformer layer
dp_size=128 # data parallelization size for sharding
ep_size=8 # expert parallelization size, moe_dp_sharding_size = dp_size / ep_size
tp_size=1 # tensor parallelization size
ds_zero=3 # dp sharding strategy, https://github.com/cli99/llm-analysis#parallelism-scheme
mlp_activation_quant_bits=8 # number of bits used for mlp activation
mlp_recompute_gelu=True # whether to recompute the gelu in mlp backward
flops_efficiency=0.35 # mfu
hbm_memory_efficiency=1 # gpu memory efficiency
intra_node_memory_efficiency=0.8
inter_node_memory_efficiency=0.8
master_weights_dtype_bytes=4 # FP32 master weights
other_op_bytes=4 # lion optimizer
output_dir=output_dbrx
output_file_prefix="bs${batch_size_per_gpu}-ar${activation_recomputation}-zero${ds_zero}-"
layernorm_dtype_bytes=2
# NOTE(review): ${total_num_tokens} is used below but is defined in a
# collapsed (not shown) part of the original file — confirm it is set before
# this point when splicing this span back in.

python -m llm_analysis.analysis train --model_name=${model_name} --seq_len=${seq_len} --gpu_name=${gpu_name} --dtype_name=${dtype_name} --output_dir=${output_dir} --output-file-prefix=${output_file_prefix} --activation_recomputation ${activation_recomputation} --ds_zero ${ds_zero} --batch_size_per_gpu=${batch_size_per_gpu} --total_num_gpus=${num_gpus} --dp_size=${dp_size} --tp_size=${tp_size} --ep_size=${ep_size} --flops_efficiency=${flops_efficiency} --hbm_memory_efficiency=${hbm_memory_efficiency} --total_num_tokens ${total_num_tokens} --mlp_activation_quant_bits ${mlp_activation_quant_bits} --layernorm_dtype_bytes=${layernorm_dtype_bytes} --mlp_recompute_gelu ${mlp_recompute_gelu} --master_weights_dtype_bytes ${master_weights_dtype_bytes} --other_op_bytes ${other_op_bytes} --intra_node_memory_efficiency ${intra_node_memory_efficiency} --inter_node_memory_efficiency ${inter_node_memory_efficiency}

0 comments on commit 93ad8f1

Please sign in to comment.