Encoding checkpoint reshaping guide #349

Draft · wants to merge 2 commits into base: main
54 changes: 53 additions & 1 deletion megatron/checkpointing.py
@@ -31,6 +31,19 @@
get_tokenizer)
from megatron.enums import PositionEmbeddingType


from deepspeed.checkpoint import (
ORIGINAL_VOCAB_SIZE,
PADDED_VOCAB_SIZE,
UNIVERSAL_CHECKPOINT_INFO,
UNIVERSAL_CHECKPOINT_VERSION_KEY,
UNIVERSAL_CHECKPOINT_VERSION_VALUE,
VOCABULARY_PARAMETER_PATTERNS,
PIPELINE_REPLICATED_PARAMETER_PATTERNS,
PARAMETER_TO_AVERAGE_PATTERNS,
PARAMETER_WITH_ROW_PARALLELISM_PATTERNS,
)

_CHECKPOINT_VERSION = None

def set_checkpoint_version(value):
@@ -133,6 +146,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
state_dict['iteration'] = iteration
state_dict['tokens'] = args.consumed_train_tokens
state_dict['checkpoint_info'] = _checkpoint_info()
state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info()

# DeepSpeed saves the model/optimizer/scheduler
if not args.deepspeed:
@@ -480,4 +494,42 @@ def _checkpoint_info():
return {
"padded_vocab_size": args.padded_vocab_size,
"original_vocab_size": tokenizer.vocab_size,
}
}

def _universal_checkpoint_info():
args = get_args()
tokenizer = get_tokenizer()

info = dict()
info[UNIVERSAL_CHECKPOINT_VERSION_KEY] = UNIVERSAL_CHECKPOINT_VERSION_VALUE
info[ORIGINAL_VOCAB_SIZE] = tokenizer.vocab_size
info[PADDED_VOCAB_SIZE] = args.padded_vocab_size

# Vocabulary parameters (embeddings) that require special handling due to padding.
info[VOCABULARY_PARAMETER_PATTERNS] = ["word_embeddings.weight"]

# Replicated (shared) parameters on the pipeline dimension
info[PIPELINE_REPLICATED_PARAMETER_PATTERNS] = ["word_embeddings.weight"]

# Parameter slices that should be averaged, not concatenated.
info[PARAMETER_TO_AVERAGE_PATTERNS] = [
r"tied_modules.embed.word_embeddings.norm.weight",
r"tied_modules.embed.word_embeddings.norm.bias",
r"\d+.input_layernorm.weight",
r"\d+.input_layernorm.bias",
r"\d+.post_attention_layernorm.weight",
r"\d+.post_attention_layernorm.bias",
r"\d+.self_attention.dense.bias",
r"\d+.mlp.dense_4h_to_h.bias",
r"\d+.weight",
r"\d+.bias",
]

# Parameters that are sliced on the row dimension
info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = [
"dense_4h_to_h.weight",
"self_attention.dense.weight",
]

return info
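
For reviewers trying the change out, here is a minimal sketch of verifying that save_checkpoint now embeds the universal-checkpoint metadata. The checkpoint path, global step, and file name below are assumptions; the exact layout depends on the DeepSpeed checkpoint version in use.

# Minimal sketch: confirm the metadata written via state_dict[UNIVERSAL_CHECKPOINT_INFO]
# ends up in the saved DeepSpeed model-states file (path and file name are hypothetical).
import torch
from deepspeed.checkpoint import UNIVERSAL_CHECKPOINT_INFO

ckpt_file = "checkpoints/gpt2/tp2_pp2_dp2_toy/global_step100/mp_rank_00_model_states.pt"
state = torch.load(ckpt_file, map_location="cpu")

# Expected keys: checkpoint version, original/padded vocab sizes, and the four
# parameter-pattern lists built by _universal_checkpoint_info() above.
print(state.get(UNIVERSAL_CHECKPOINT_INFO))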

2 changes: 1 addition & 1 deletion requirements.txt
@@ -8,7 +8,7 @@ six
tensorboard
torch>=1.7
transformers
DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git
#DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git
# versions from HF transformers
black==21.4b0
isort>=5.5.4
22 changes: 12 additions & 10 deletions run_bf16.sh
@@ -30,40 +30,42 @@ CONFIG_JSON="$script_dir/ds_config.json"

USE_DEEPSPEED=1
ZERO_STAGE=0
DTYPE="bf16"

#TP=4
#PP=4

# Debug
DEBUG_MODE=0
DEBUG_MODE=1
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=3
EXIT_INTERVAL=100
RUN_TAG="toy"
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=10
EXIT_INTERVAL=100
RUN_TAG="big"
fi

TP=2
PP=2
DP=4
DP=2
WORLD_SIZE=$((TP*PP*DP))
GLOBAL_BATCH=4

MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}_$RUN_TAG
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}_$RUN_TAG

LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="bf16"
EXP_DIR=${HOME}/experiments/results/ckpt_reshape
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont"
EXP_DIR=${HOME}/experiments/results/uni_ckpt
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont_$RUN_TAG"
mkdir -p $LOG_DIR

while [[ $# -gt 0 ]]
@@ -166,7 +168,7 @@ cat <<EOT > $CONFIG_JSON
}
EOT

#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
58 changes: 35 additions & 23 deletions run_fp16.sh
@@ -12,47 +12,54 @@ DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"

BASE_DATA_PATH=/data/Megatron-LM/data
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt


script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#CONFIG_JSON="$script_dir/ds_config.json"
CONFIG_JSON="/tmp/ds_config.json"
CONFIG_JSON="$script_dir/ds_config.json"
#CONFIG_JSON="/tmp/ds_config.json"

USE_DEEPSPEED=1
ZERO_STAGE=0

ZERO_STAGE=2
DTYPE="fp16"

# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"

# Debug
DEBUG_MODE=1
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=100
RUN_TAG="toy"
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=100
RUN_TAG="big"
fi

TP=1
PP=1
DP=2
DP=1
WORLD_SIZE=$((TP*PP*DP))
HIDDEN=1024
LAYERS=24
SEQ=1024
GLOBAL_BATCH=1
WORKER_STR=""
GLOBAL_BATCH=4

MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_$RUN_TAG
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_$RUN_TAG
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="fp16"
EXP_DIR=${HOME}/experiments/results/bf16
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3"
EXP_DIR="${HOME}/experiments/results/z${ZERO_STAGE}_uni_ckpt"
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont_$RUN_TAG"
mkdir -p $LOG_DIR

while [[ $# -gt 0 ]]
@@ -88,7 +95,7 @@ options=" \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--train-iters $TRAIN_ITERS \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
@@ -98,7 +105,7 @@
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 10000 \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
@@ -107,7 +114,7 @@
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval 10000 \
--exit-interval ${EXIT_INTERVAL} \
--save ${CHECKPOINT_PATH} \
--load ${LOAD_CHECKPOINT_PATH} \
--position-embedding-type alibi \
--override-lr-scheduler \
--embed-layernorm \
--tensorboard-dir $LOG_DIR
"

26 changes: 14 additions & 12 deletions run_universal_bf16.sh
@@ -30,40 +30,42 @@ CONFIG_JSON="$script_dir/ds_config.json"

USE_DEEPSPEED=1
ZERO_STAGE=0
DTYPE="bf16"

#TP=4
#PP=4

# Debug
DEBUG_MODE=0
DEBUG_MODE=1
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=3
EXIT_INTERVAL=100
RUN_TAG="toy"
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=10
EXIT_INTERVAL=100
RUN_TAG="big"
fi

TP=2
PP=2
DP=4
TP=1
PP=1
DP=2
WORLD_SIZE=$((TP*PP*DP))
GLOBAL_BATCH=4

MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp2_pp2_dp4
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}_$RUN_TAG
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp2_pp2_dp2_$RUN_TAG

LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="bf16"
EXP_DIR=${HOME}/experiments/results/ckpt_reshape
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni"
EXP_DIR=${HOME}/experiments/results/uni_ckpt
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_ref_uni_$RUN_TAG"
mkdir -p $LOG_DIR

while [[ $# -gt 0 ]]
@@ -167,7 +169,7 @@ cat <<EOT > $CONFIG_JSON
}
EOT

#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"