From 42f4b6ebac0f1b0fc782ccf736ba91b882d45dc3 Mon Sep 17 00:00:00 2001 From: stceum <50257864+stceum@users.noreply.github.com> Date: Thu, 2 Jan 2025 17:04:13 +0800 Subject: [PATCH] Add opt 350M training log demo for step 2 dpo finetuning in DeepSpeed-Chat. --- .../opt-350M_globalBatchSize-32.log | 6409 +++++++++++++++++ 1 file changed, 6409 insertions(+) create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log new file mode 100644 index 000000000..b8b160e40 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log @@ -0,0 +1,6409 @@ +[2025-01-02 15:45:19,529] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:22,160] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2025-01-02 15:45:22,160] [INFO] [runner.py:607:main] cmd = .venv/dsexamples/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf --data_split 2,4,4 --model_name_or_path facebook/opt-350m --per_device_train_batch_size 2 --per_device_eval_batch_size 4 --max_seq_len 512 --learning_rate 5e-5 --weight_decay 0.1 --num_train_epochs 1 --dropout 0.0 --gradient_accumulation_steps 2 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 0 --deepspeed --output_dir ./output +[2025-01-02 15:45:24,254] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:26,831] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2025-01-02 15:45:26,831] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0 +[2025-01-02 15:45:26,831] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2025-01-02 15:45:26,832] [INFO] [launch.py:164:main] dist_world_size=8 +[2025-01-02 15:45:26,832] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2025-01-02 15:45:26,832] [INFO] [launch.py:256:main] process 106031 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=0', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,833] [INFO] [launch.py:256:main] process 106032 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=1', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,833] [INFO] [launch.py:256:main] process 106033 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=2', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,834] [INFO] [launch.py:256:main] process 106034 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=3', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,834] [INFO] [launch.py:256:main] process 106035 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=4', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,835] [INFO] [launch.py:256:main] process 106036 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=5', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,835] [INFO] [launch.py:256:main] process 106037 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=6', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,836] [INFO] [launch.py:256:main] process 106038 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=7', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:30,732] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:30,910] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,042] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,049] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,103] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,135] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,144] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,147] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:33,071] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:33,584] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:33,584] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[rank3]:[W102 15:45:34.866823429 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,383] [INFO] [comm.py:652:init_distributed] cdb=None +[rank4]:[W102 15:45:34.247342944 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,450] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:34,472] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:34,472] [INFO] [comm.py:652:init_distributed] cdb=None +[rank2]:[W102 15:45:34.334422354 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank1]:[W102 15:45:34.339768589 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank6]:[W102 15:45:34.340404849 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,557] [INFO] [comm.py:652:init_distributed] cdb=None +[rank7]:[W102 15:45:34.425895009 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,604] [INFO] [comm.py:652:init_distributed] cdb=None +[rank5]:[W102 15:45:34.470924200 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank0]:[W102 15:45:34.726812378 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.05063033103942871 seconds +[2025-01-02 15:46:44,140] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.04431414604187012 seconds +[2025-01-02 15:46:44,304] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.11332392692565918 seconds +[2025-01-02 15:46:44,637] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.2, git-hash=unknown, git-branch=unknown +[2025-01-02 15:46:44,637] [INFO] [comm.py:677:init_distributed] Distributed backend already initialized +[2025-01-02 15:46:44,637] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.10501360893249512 seconds +[2025-01-02 15:46:45,431] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.033315420150756836 seconds +[2025-01-02 15:46:48,423] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.1015481948852539 seconds +[2025-01-02 15:46:49,014] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.13011908531188965 seconds +[2025-01-02 15:46:50,464] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Loading extension module fused_adam... +Time to load fused_adam op: 0.20447063446044922 seconds +[2025-01-02 15:46:50,545] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,765] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-01-02 15:46:55,766] [INFO] [logging.py:128:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2025-01-02 15:46:55,767] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-01-02 15:46:55,785] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-01-02 15:46:55,785] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = FP16_Optimizer +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05, 5e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:46:55,968] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] amp_params ................... False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] bfloat16_enabled ............. False +[2025-01-02 15:46:55,969] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] comms_config ................. +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-01-02 15:46:55,969] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dump_state ................... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_auto_cast ............... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_enabled ................. True +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 2 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 65536 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] loss_scale ................... 0 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-01-02 15:46:55,971] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='step2_tensorboard/ds_tensorboard_logs/', job_name='step2_model_tensorboard') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_name ............... None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_params ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pld_params ................... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] steps_per_print .............. 10 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] train_batch_size ............. 32 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 2 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] world_size ................... 8 +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=30000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_enabled ................. False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_optimization_stage ...... 0 +[2025-01-02 15:46:55,972] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,972] [INFO] [config.py:989:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "overlap_comm": true, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "stage3_param_persistence_threshold": 1.000000e+04, + "stage3_max_live_parameters": 3.000000e+07, + "stage3_prefetch_bucket_size": 3.000000e+07, + "memory_efficient_linear": false + }, + "fp16": { + "enabled": true, + "loss_scale_window": 100 + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "hybrid_engine": { + "enabled": false, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + }, + "tensorboard": { + "enabled": false, + "output_path": "step2_tensorboard/ds_tensorboard_logs/", + "job_name": "step2_model_tensorboard" + } +} +[2025-01-02 15:46:55,972] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.2, git-hash=unknown, git-branch=unknown +[2025-01-02 15:46:55,972] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,978] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:56,023] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:47:02,563] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-01-02 15:47:02,565] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] amp_params ................... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] bfloat16_enabled ............. False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] comms_config ................. +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dump_state ................... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_auto_cast ............... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_enabled ................. True +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 2 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 65536 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] loss_scale ................... 0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_name ............... None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_params ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pld_params ................... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] steps_per_print .............. 10 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] train_batch_size ............. 32 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 2 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] world_size ................... 8 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_enabled ................. False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_optimization_stage ...... 0 +[2025-01-02 15:47:02,568] [INFO] [config.py:989:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "stage3_param_persistence_threshold": 1.000000e+04, + "offload_param": { + "device": "none" + }, + "memory_efficient_linear": false + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false +} +***** Running training ***** +***** Evaluating rewards, Epoch 1/1 ***** +chosen: 0.0, rejected: 0.0, loss: 0.693359375 +Beginning of Epoch 1/1, Total Micro Batches 4708 +Model Parameters: 0.331 B, Latency: 0.27s, TFLOPs: 1.02, Samples/sec: 7.28, Time/seq 0.14s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:32,711] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,711] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale fr[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,713] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 65536, reducing to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,713] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.06, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.10s, TFLOPs: 2.70, Samples/sec: 19.26, Time/seq 0.05s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale fr[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +Model Parameters: 0.331 B, Latency: 0.52s, TFLOPs: 0.54, Samples/sec: 3.84, Time/seq 0.26s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 16384.0, reducing to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.62s, TFLOPs: 0.46, Samples/sec: 3.25, Time/seq 0.31s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,841] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=4, lr=[4.999919851200522e-05, 4.999919851200522e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:38,842] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=46.98077573584925, CurrSamplesPerSec=50.86885115471782, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:45,728] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=4, lr=[4.999430071591966e-05, 4.999430071591966e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:45,746] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=46.82057321463456, CurrSamplesPerSec=46.3240790591193, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:52,687] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=4, lr=[4.998495126612987e-05, 4.998495126612987e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:52,708] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=46.657677501465244, CurrSamplesPerSec=46.740073109032714, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:59,588] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=4, lr=[4.9971151827835975e-05, 4.9971151827835975e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:59,608] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=46.67284201254282, CurrSamplesPerSec=46.70549424381561, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:06,507] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=4, lr=[4.995290485881111e-05, 4.995290485881111e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:06,528] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=46.658721893844756, CurrSamplesPerSec=46.780295098825675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:13,403] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=4, lr=[4.993021360896366e-05, 4.993021360896366e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:13,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=46.67496477850472, CurrSamplesPerSec=46.74603117536787, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:20,321] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=4, lr=[4.99030821197584e-05, 4.99030821197584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:20,340] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=46.66552508644222, CurrSamplesPerSec=46.58816877398781, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.08, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:27,157] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=5, lr=[4.987487135239265e-05, 4.987487135239265e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:27,178] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=46.72639900113086, CurrSamplesPerSec=46.69866909880398, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:34,057] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=5, lr=[4.983931737433311e-05, 4.983931737433311e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:34,078] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=46.72344397631967, CurrSamplesPerSec=46.670138855517386, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:40,995] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=5, lr=[4.979933934614882e-05, 4.979933934614882e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:41,016] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=46.69784876342507, CurrSamplesPerSec=46.6644759276489, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:47,897] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=5, lr=[4.9754944388196535e-05, 4.9754944388196535e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:47,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=46.697522740072756, CurrSamplesPerSec=46.6885325659028, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:54,803] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=5, lr=[4.970614040751798e-05, 4.970614040751798e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:54,823] [INFO] [timer.py:264:stop] epoch=0/micro_step=240/global_step=120, RunningAvgSamplesPerSec=46.69541297477131, CurrSamplesPerSec=46.75664876248707, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:01,714] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=5, lr=[4.96529360964316e-05, 4.96529360964316e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:01,734] [INFO] [timer.py:264:stop] epoch=0/micro_step=260/global_step=130, RunningAvgSamplesPerSec=46.69230036582147, CurrSamplesPerSec=46.71986606957944, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:08,600] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=6, lr=[4.9601297749741036e-05, 4.9601297749741036e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:08,620] [INFO] [timer.py:264:stop] epoch=0/micro_step=280/global_step=140, RunningAvgSamplesPerSec=46.701067188567556, CurrSamplesPerSec=46.34668162363741, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:15,555] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=6, lr=[4.9539759563783176e-05, 4.9539759563783176e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:15,575] [INFO] [timer.py:264:stop] epoch=0/micro_step=300/global_step=150, RunningAvgSamplesPerSec=46.676625606909724, CurrSamplesPerSec=46.37509004775261, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:22,519] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=6, lr=[4.947385068096907e-05, 4.947385068096907e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:22,537] [INFO] [timer.py:264:stop] epoch=0/micro_step=320/global_step=160, RunningAvgSamplesPerSec=46.652045629756806, CurrSamplesPerSec=45.754688762495974, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:29,477] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=6, lr=[4.940358284011574e-05, 4.940358284011574e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:29,498] [INFO] [timer.py:264:stop] epoch=0/micro_step=340/global_step=170, RunningAvgSamplesPerSec=46.63235423832134, CurrSamplesPerSec=46.33578549044944, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:36,430] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=6, lr=[4.9328968556400026e-05, 4.9328968556400026e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:36,451] [INFO] [timer.py:264:stop] epoch=0/micro_step=360/global_step=180, RunningAvgSamplesPerSec=46.617390821416194, CurrSamplesPerSec=46.339240977849904, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.93, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:43,383] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=6, lr=[4.9250021119129636e-05, 4.9250021119129636e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:43,403] [INFO] [timer.py:264:stop] epoch=0/micro_step=380/global_step=190, RunningAvgSamplesPerSec=46.60359803968563, CurrSamplesPerSec=46.34333703776871, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:50,336] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=6, lr=[4.916675458937614e-05, 4.916675458937614e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:50,356] [INFO] [timer.py:264:stop] epoch=0/micro_step=400/global_step=200, RunningAvgSamplesPerSec=46.59140057519339, CurrSamplesPerSec=46.578597409153254, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,558] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 1024.0, reducing to 512.0 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:57,226] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=7, lr=[4.908813412994094e-05, 4.908813412994094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:57,247] [INFO] [timer.py:264:stop] epoch=0/micro_step=420/global_step=210, RunningAvgSamplesPerSec=46.60008012821061, CurrSamplesPerSec=46.86441592162194, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.49, Samples/sec: 3.51, Time/seq 0.29s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:04,214] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=7, lr=[4.899670281569845e-05, 4.899670281569845e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:04,232] [INFO] [timer.py:264:stop] epoch=0/micro_step=440/global_step=220, RunningAvgSamplesPerSec=46.580701736622736, CurrSamplesPerSec=45.654661323911306, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:11,163] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=7, lr=[4.890099752667294e-05, 4.890099752667294e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:11,183] [INFO] [timer.py:264:stop] epoch=0/micro_step=460/global_step=230, RunningAvgSamplesPerSec=46.57159456525261, CurrSamplesPerSec=46.32703709314177, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.99, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:18,110] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=7, lr=[4.880103530862256e-05, 4.880103530862256e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:18,131] [INFO] [timer.py:264:stop] epoch=0/micro_step=480/global_step=240, RunningAvgSamplesPerSec=46.563629676576625, CurrSamplesPerSec=46.309614118584825, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:25,065] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=7, lr=[4.86968339654932e-05, 4.86968339654932e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:25,086] [INFO] [timer.py:264:stop] epoch=0/micro_step=500/global_step=250, RunningAvgSamplesPerSec=46.554810311450424, CurrSamplesPerSec=46.38267044164213, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:32,025] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=7, lr=[4.858841205624759e-05, 4.858841205624759e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:32,046] [INFO] [timer.py:264:stop] epoch=0/micro_step=520/global_step=260, RunningAvgSamplesPerSec=46.54628059347242, CurrSamplesPerSec=46.90926209286344, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:38,997] [INFO] [logging.py:128:log_dist] [Rank 0] step=270, skipped=7, lr=[4.8475788891559783e-05, 4.8475788891559783e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:39,018] [INFO] [timer.py:264:stop] epoch=0/micro_step=540/global_step=270, RunningAvgSamplesPerSec=46.535565503469485, CurrSamplesPerSec=46.32256021546509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:45,948] [INFO] [logging.py:128:log_dist] [Rank 0] step=280, skipped=7, lr=[4.835898453037574e-05, 4.835898453037574e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:45,969] [INFO] [timer.py:264:stop] epoch=0/micro_step=560/global_step=280, RunningAvgSamplesPerSec=46.52910793247543, CurrSamplesPerSec=46.34376908630242, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:52,901] [INFO] [logging.py:128:log_dist] [Rank 0] step=290, skipped=7, lr=[4.823801977634082e-05, 4.823801977634082e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:52,922] [INFO] [timer.py:264:stop] epoch=0/micro_step=580/global_step=290, RunningAvgSamplesPerSec=46.52320489698547, CurrSamplesPerSec=46.32582185506885, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:59,855] [INFO] [logging.py:128:log_dist] [Rank 0] step=300, skipped=7, lr=[4.811291617409437e-05, 4.811291617409437e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:59,875] [INFO] [timer.py:264:stop] epoch=0/micro_step=600/global_step=300, RunningAvgSamplesPerSec=46.51706229660043, CurrSamplesPerSec=46.36179428641033, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:06,787] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,789] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,789] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,794] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,794] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,807] [INFO] [logging.py:128:log_dist] [Rank 0] step=310, skipped=7, lr=[4.7983696005432587e-05, 4.7983696005432587e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:06,827] [INFO] [timer.py:264:stop] epoch=0/micro_step=620/global_step=310, RunningAvgSamplesPerSec=46.511774321108796, CurrSamplesPerSec=46.36275517013998, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:13,760] [INFO] [logging.py:128:log_dist] [Rank 0] step=320, skipped=7, lr=[4.7850382285339924e-05, 4.7850382285339924e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:13,780] [INFO] [timer.py:264:stop] epoch=0/micro_step=640/global_step=320, RunningAvgSamplesPerSec=46.50668977864453, CurrSamplesPerSec=46.34026492494737, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:20,713] [INFO] [logging.py:128:log_dist] [Rank 0] step=330, skipped=7, lr=[4.771299875788999e-05, 4.771299875788999e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:20,734] [INFO] [timer.py:264:stop] epoch=0/micro_step=660/global_step=330, RunningAvgSamplesPerSec=46.502571796260625, CurrSamplesPerSec=46.319203124992384, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:27,664] [INFO] [logging.py:128:log_dist] [Rank 0] step=340, skipped=7, lr=[4.7571569892016555e-05, 4.7571569892016555e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:27,685] [INFO] [timer.py:264:stop] epoch=0/micro_step=680/global_step=340, RunningAvgSamplesPerSec=46.49864772289161, CurrSamplesPerSec=46.32430289713105, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:34,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=350, skipped=7, lr=[4.742612087715547e-05, 4.742612087715547e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:34,631] [INFO] [timer.py:264:stop] epoch=0/micro_step=700/global_step=350, RunningAvgSamplesPerSec=46.495692632814894, CurrSamplesPerSec=46.390253314093556, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:41,565] [INFO] [logging.py:128:log_dist] [Rank 0] step=360, skipped=7, lr=[4.727667761875828e-05, 4.727667761875828e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:41,586] [INFO] [timer.py:264:stop] epoch=0/micro_step=720/global_step=360, RunningAvgSamplesPerSec=46.491304442529376, CurrSamplesPerSec=46.325166292582765, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:48,518] [INFO] [logging.py:128:log_dist] [Rank 0] step=370, skipped=7, lr=[4.712326673367824e-05, 4.712326673367824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:48,538] [INFO] [timer.py:264:stop] epoch=0/micro_step=740/global_step=370, RunningAvgSamplesPerSec=46.487412684576384, CurrSamplesPerSec=46.33804109742901, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:55,472] [INFO] [logging.py:128:log_dist] [Rank 0] step=380, skipped=7, lr=[4.696591554542973e-05, 4.696591554542973e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:55,493] [INFO] [timer.py:264:stop] epoch=0/micro_step=760/global_step=380, RunningAvgSamplesPerSec=46.48344060347751, CurrSamplesPerSec=46.31380082968855, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:02,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=390, skipped=7, lr=[4.6804652079321726e-05, 4.6804652079321726e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:02,441] [INFO] [timer.py:264:stop] epoch=0/micro_step=780/global_step=390, RunningAvgSamplesPerSec=46.48118928657953, CurrSamplesPerSec=46.933735963905434, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:09,377] [INFO] [logging.py:128:log_dist] [Rank 0] step=400, skipped=7, lr=[4.663950505746629e-05, 4.663950505746629e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:09,398] [INFO] [timer.py:264:stop] epoch=0/micro_step=800/global_step=400, RunningAvgSamplesPerSec=46.47709234422624, CurrSamplesPerSec=46.35471700350897, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:16,305] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,307] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,309] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,309] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,312] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,312] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,325] [INFO] [logging.py:128:log_dist] [Rank 0] step=410, skipped=7, lr=[4.6470503893662995e-05, 4.6470503893662995e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:16,345] [INFO] [timer.py:264:stop] epoch=0/micro_step=820/global_step=410, RunningAvgSamplesPerSec=46.47536079054126, CurrSamplesPerSec=46.36307547356772, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:23,279] [INFO] [logging.py:128:log_dist] [Rank 0] step=420, skipped=7, lr=[4.6297678688160096e-05, 4.6297678688160096e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:23,299] [INFO] [timer.py:264:stop] epoch=0/micro_step=840/global_step=420, RunningAvgSamplesPerSec=46.47249330297309, CurrSamplesPerSec=46.40106279118714, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:30,233] [INFO] [logging.py:128:log_dist] [Rank 0] step=430, skipped=7, lr=[4.612106022229352e-05, 4.612106022229352e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:30,254] [INFO] [timer.py:264:stop] epoch=0/micro_step=860/global_step=430, RunningAvgSamplesPerSec=46.469326490503796, CurrSamplesPerSec=46.32311977785527, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:37,190] [INFO] [logging.py:128:log_dist] [Rank 0] step=440, skipped=7, lr=[4.594067995300447e-05, 4.594067995300447e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:37,210] [INFO] [timer.py:264:stop] epoch=0/micro_step=880/global_step=440, RunningAvgSamplesPerSec=46.466282683995, CurrSamplesPerSec=46.33493769496381, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:44,142] [INFO] [logging.py:128:log_dist] [Rank 0] step=450, skipped=7, lr=[4.57565700072367e-05, 4.57565700072367e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:44,162] [INFO] [timer.py:264:stop] epoch=0/micro_step=900/global_step=450, RunningAvgSamplesPerSec=46.463773986613354, CurrSamplesPerSec=46.314376162125654, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:51,093] [INFO] [logging.py:128:log_dist] [Rank 0] step=460, skipped=7, lr=[4.556876317621458e-05, 4.556876317621458e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:51,114] [INFO] [timer.py:264:stop] epoch=0/micro_step=920/global_step=460, RunningAvgSamplesPerSec=46.46132522763121, CurrSamplesPerSec=46.33345012880701, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:58,047] [INFO] [logging.py:128:log_dist] [Rank 0] step=470, skipped=7, lr=[4.5377292909602656e-05, 4.5377292909602656e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:58,068] [INFO] [timer.py:264:stop] epoch=0/micro_step=940/global_step=470, RunningAvgSamplesPerSec=46.45878779560237, CurrSamplesPerSec=46.43445319841673, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:04,937] [INFO] [logging.py:128:log_dist] [Rank 0] step=480, skipped=8, lr=[4.520186560426292e-05, 4.520186560426292e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:04,958] [INFO] [timer.py:264:stop] epoch=0/micro_step=960/global_step=480, RunningAvgSamplesPerSec=46.4652574508203, CurrSamplesPerSec=46.38100350402843, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:11,893] [INFO] [logging.py:128:log_dist] [Rank 0] step=490, skipped=8, lr=[4.5003529295830075e-05, 4.5003529295830075e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:11,913] [INFO] [timer.py:264:stop] epoch=0/micro_step=980/global_step=490, RunningAvgSamplesPerSec=46.46303714998046, CurrSamplesPerSec=46.34392910632221, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:18,845] [INFO] [logging.py:128:log_dist] [Rank 0] step=500, skipped=8, lr=[4.4801630223777665e-05, 4.4801630223777665e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:18,865] [INFO] [timer.py:264:stop] epoch=0/micro_step=1000/global_step=500, RunningAvgSamplesPerSec=46.46079008466478, CurrSamplesPerSec=46.36059323775675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:25,798] [INFO] [logging.py:128:log_dist] [Rank 0] step=510, skipped=8, lr=[4.459620434769351e-05, 4.459620434769351e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:25,819] [INFO] [timer.py:264:stop] epoch=0/micro_step=1020/global_step=510, RunningAvgSamplesPerSec=46.45906494240357, CurrSamplesPerSec=46.75834281138947, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.99, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:32,751] [INFO] [logging.py:128:log_dist] [Rank 0] step=520, skipped=8, lr=[4.438728825531305e-05, 4.438728825531305e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:32,772] [INFO] [timer.py:264:stop] epoch=0/micro_step=1040/global_step=520, RunningAvgSamplesPerSec=46.45683775721524, CurrSamplesPerSec=46.369770828201965, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:39,703] [INFO] [logging.py:128:log_dist] [Rank 0] step=530, skipped=8, lr=[4.417491915600285e-05, 4.417491915600285e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:39,724] [INFO] [timer.py:264:stop] epoch=0/micro_step=1060/global_step=530, RunningAvgSamplesPerSec=46.454981721927275, CurrSamplesPerSec=46.67327109354972, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:46,661] [INFO] [logging.py:128:log_dist] [Rank 0] step=540, skipped=8, lr=[4.395913487413324e-05, 4.395913487413324e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:46,681] [INFO] [timer.py:264:stop] epoch=0/micro_step=1080/global_step=540, RunningAvgSamplesPerSec=46.45267445975501, CurrSamplesPerSec=46.34778592305895, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:53,612] [INFO] [logging.py:128:log_dist] [Rank 0] step=550, skipped=8, lr=[4.37399738423417e-05, 4.37399738423417e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:53,632] [INFO] [timer.py:264:stop] epoch=0/micro_step=1100/global_step=550, RunningAvgSamplesPerSec=46.45093776947002, CurrSamplesPerSec=46.396587634057866, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:00,566] [INFO] [logging.py:128:log_dist] [Rank 0] step=560, skipped=8, lr=[4.351747509468763e-05, 4.351747509468763e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:00,586] [INFO] [timer.py:264:stop] epoch=0/micro_step=1120/global_step=560, RunningAvgSamplesPerSec=46.44909549888697, CurrSamplesPerSec=46.45555564637373, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:07,514] [INFO] [logging.py:128:log_dist] [Rank 0] step=570, skipped=8, lr=[4.3291678259700163e-05, 4.3291678259700163e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:07,534] [INFO] [timer.py:264:stop] epoch=0/micro_step=1140/global_step=570, RunningAvgSamplesPerSec=46.44810234293339, CurrSamplesPerSec=46.34565739295321, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,279] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,284] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,284] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:14,467] [INFO] [logging.py:128:log_dist] [Rank 0] step=580, skipped=8, lr=[4.306262355332006e-05, 4.306262355332006e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:14,487] [INFO] [timer.py:264:stop] epoch=0/micro_step=1160/global_step=580, RunningAvgSamplesPerSec=46.446247391835065, CurrSamplesPerSec=46.379192448009945, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:21,419] [INFO] [logging.py:128:log_dist] [Rank 0] step=590, skipped=8, lr=[4.2830351771736965e-05, 4.2830351771736965e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:21,440] [INFO] [timer.py:264:stop] epoch=0/micro_step=1180/global_step=590, RunningAvgSamplesPerSec=46.444511706837304, CurrSamplesPerSec=46.33538558305289, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:28,372] [INFO] [logging.py:128:log_dist] [Rank 0] step=600, skipped=8, lr=[4.259490428412335e-05, 4.259490428412335e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:28,392] [INFO] [timer.py:264:stop] epoch=0/micro_step=1200/global_step=600, RunningAvgSamplesPerSec=46.44315240764414, CurrSamplesPerSec=46.37826293436061, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:35,326] [INFO] [logging.py:128:log_dist] [Rank 0] step=610, skipped=8, lr=[4.235632302526635e-05, 4.235632302526635e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:35,347] [INFO] [timer.py:264:stop] epoch=0/micro_step=1220/global_step=610, RunningAvgSamplesPerSec=46.4417772135784, CurrSamplesPerSec=46.33301827265184, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:42,279] [INFO] [logging.py:128:log_dist] [Rank 0] step=620, skipped=8, lr=[4.2114650488098936e-05, 4.2114650488098936e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:42,299] [INFO] [timer.py:264:stop] epoch=0/micro_step=1240/global_step=620, RunningAvgSamplesPerSec=46.44041818234045, CurrSamplesPerSec=46.349610533055575, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:49,231] [INFO] [logging.py:128:log_dist] [Rank 0] step=630, skipped=8, lr=[4.1869929716131605e-05, 4.1869929716131605e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:49,251] [INFO] [timer.py:264:stop] epoch=0/micro_step=1260/global_step=630, RunningAvgSamplesPerSec=46.43896915559678, CurrSamplesPerSec=46.32174487732559, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:54,756] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,756] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,756] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:56,121] [INFO] [logging.py:128:log_dist] [Rank 0] step=640, skipped=9, lr=[4.164711079369153e-05, 4.164711079369153e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:56,141] [INFO] [timer.py:264:stop] epoch=0/micro_step=1280/global_step=640, RunningAvgSamplesPerSec=46.44426552356086, CurrSamplesPerSec=46.362723140040615, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:03,067] [INFO] [logging.py:128:log_dist] [Rank 0] step=650, skipped=9, lr=[4.1396718898658025e-05, 4.1396718898658025e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:03,087] [INFO] [timer.py:264:stop] epoch=0/micro_step=1300/global_step=650, RunningAvgSamplesPerSec=46.443367108838785, CurrSamplesPerSec=46.33520962598524, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:10,025] [INFO] [logging.py:128:log_dist] [Rank 0] step=660, skipped=9, lr=[4.1143406637287735e-05, 4.1143406637287735e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:10,045] [INFO] [timer.py:264:stop] epoch=0/micro_step=1320/global_step=660, RunningAvgSamplesPerSec=46.44134794762593, CurrSamplesPerSec=46.33818507979863, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:16,979] [INFO] [logging.py:128:log_dist] [Rank 0] step=670, skipped=9, lr=[4.088721912620461e-05, 4.088721912620461e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:17,000] [INFO] [timer.py:264:stop] epoch=0/micro_step=1340/global_step=670, RunningAvgSamplesPerSec=46.43998496782961, CurrSamplesPerSec=47.037639814383006, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:23,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=680, skipped=9, lr=[4.0628201994134016e-05, 4.0628201994134016e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:23,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=1360/global_step=680, RunningAvgSamplesPerSec=46.43896855112754, CurrSamplesPerSec=46.311851198280706, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.15s, TFLOPs: 1.93, Samples/sec: 13.76, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:30,886] [INFO] [logging.py:128:log_dist] [Rank 0] step=690, skipped=9, lr=[4.036640137377588e-05, 4.036640137377588e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:30,907] [INFO] [timer.py:264:stop] epoch=0/micro_step=1380/global_step=690, RunningAvgSamplesPerSec=46.437531826930254, CurrSamplesPerSec=45.93531883513, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:37,833] [INFO] [logging.py:128:log_dist] [Rank 0] step=700, skipped=9, lr=[4.010186389358825e-05, 4.010186389358825e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:37,854] [INFO] [timer.py:264:stop] epoch=0/micro_step=1400/global_step=700, RunningAvgSamplesPerSec=46.436798376271994, CurrSamplesPerSec=46.37616365270356, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:44,791] [INFO] [logging.py:128:log_dist] [Rank 0] step=710, skipped=9, lr=[3.983463666948233e-05, 3.983463666948233e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:44,811] [INFO] [timer.py:264:stop] epoch=0/micro_step=1420/global_step=710, RunningAvgSamplesPerSec=46.43521967480529, CurrSamplesPerSec=46.30605121647706, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:51,744] [INFO] [logging.py:128:log_dist] [Rank 0] step=720, skipped=9, lr=[3.9564767296430877e-05, 3.9564767296430877e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:51,765] [INFO] [timer.py:264:stop] epoch=0/micro_step=1440/global_step=720, RunningAvgSamplesPerSec=46.434081332236666, CurrSamplesPerSec=46.35162737441071, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:58,693] [INFO] [logging.py:128:log_dist] [Rank 0] step=730, skipped=9, lr=[3.929230383999124e-05, 3.929230383999124e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:58,714] [INFO] [timer.py:264:stop] epoch=0/micro_step=1460/global_step=730, RunningAvgSamplesPerSec=46.43326436780265, CurrSamplesPerSec=46.40451197798106, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.54, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,941] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,941] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,942] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,942] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,945] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,946] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:05,667] [INFO] [logging.py:128:log_dist] [Rank 0] step=740, skipped=9, lr=[3.901729482774453e-05, 3.901729482774453e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:05,684] [INFO] [timer.py:264:stop] epoch=0/micro_step=1480/global_step=740, RunningAvgSamplesPerSec=46.43108873755449, CurrSamplesPerSec=45.89287960413895, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:12,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=750, skipped=9, lr=[3.8739789240652524e-05, 3.8739789240652524e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:12,632] [INFO] [timer.py:264:stop] epoch=0/micro_step=1500/global_step=750, RunningAvgSamplesPerSec=46.430940342580826, CurrSamplesPerSec=46.33469775847788, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:19,572] [INFO] [logging.py:128:log_dist] [Rank 0] step=760, skipped=9, lr=[3.845983650433384e-05, 3.845983650433384e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:19,593] [INFO] [timer.py:264:stop] epoch=0/micro_step=1520/global_step=760, RunningAvgSamplesPerSec=46.42927334106034, CurrSamplesPerSec=46.41226248648767, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:26,522] [INFO] [logging.py:128:log_dist] [Rank 0] step=770, skipped=9, lr=[3.817748648026087e-05, 3.817748648026087e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:26,542] [INFO] [timer.py:264:stop] epoch=0/micro_step=1540/global_step=770, RunningAvgSamplesPerSec=46.42870842332397, CurrSamplesPerSec=46.3644207962945, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:33,474] [INFO] [logging.py:128:log_dist] [Rank 0] step=780, skipped=9, lr=[3.78927894568792e-05, 3.78927894568792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:33,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=1560/global_step=780, RunningAvgSamplesPerSec=46.427907498021575, CurrSamplesPerSec=46.35952035358448, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:40,426] [INFO] [logging.py:128:log_dist] [Rank 0] step=790, skipped=9, lr=[3.7605796140650764e-05, 3.7605796140650764e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:40,447] [INFO] [timer.py:264:stop] epoch=0/micro_step=1580/global_step=790, RunningAvgSamplesPerSec=46.42688505692321, CurrSamplesPerSec=46.38398484232087, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:47,379] [INFO] [logging.py:128:log_dist] [Rank 0] step=800, skipped=9, lr=[3.73165576470228e-05, 3.73165576470228e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:47,399] [INFO] [timer.py:264:stop] epoch=0/micro_step=1600/global_step=800, RunningAvgSamplesPerSec=46.42593307965039, CurrSamplesPerSec=46.31824404565779, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:54,330] [INFO] [logging.py:128:log_dist] [Rank 0] step=810, skipped=9, lr=[3.70251254913238e-05, 3.70251254913238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:54,350] [INFO] [timer.py:264:stop] epoch=0/micro_step=1620/global_step=810, RunningAvgSamplesPerSec=46.42510739233998, CurrSamplesPerSec=46.36661512111462, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:01,282] [INFO] [logging.py:128:log_dist] [Rank 0] step=820, skipped=9, lr=[3.673155157958827e-05, 3.673155157958827e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:01,303] [INFO] [timer.py:264:stop] epoch=0/micro_step=1640/global_step=820, RunningAvgSamplesPerSec=46.42418767305092, CurrSamplesPerSec=46.3146478518564, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:08,234] [INFO] [logging.py:128:log_dist] [Rank 0] step=830, skipped=9, lr=[3.6435888199311916e-05, 3.6435888199311916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:08,254] [INFO] [timer.py:264:stop] epoch=0/micro_step=1660/global_step=830, RunningAvgSamplesPerSec=46.42334132276314, CurrSamplesPerSec=46.34514529458703, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:14,472] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,474] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,475] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,479] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,479] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:15,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=840, skipped=9, lr=[3.6138188010138916e-05, 3.6138188010138916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:15,208] [INFO] [timer.py:264:stop] epoch=0/micro_step=1680/global_step=840, RunningAvgSamplesPerSec=46.42245676253584, CurrSamplesPerSec=46.35514926425546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:22,140] [INFO] [logging.py:128:log_dist] [Rank 0] step=850, skipped=9, lr=[3.583850403448287e-05, 3.583850403448287e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:22,161] [INFO] [timer.py:264:stop] epoch=0/micro_step=1700/global_step=850, RunningAvgSamplesPerSec=46.42177760625656, CurrSamplesPerSec=46.53673661663851, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:29,092] [INFO] [logging.py:128:log_dist] [Rank 0] step=860, skipped=9, lr=[3.5536889648083114e-05, 3.5536889648083114e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:29,113] [INFO] [timer.py:264:stop] epoch=0/micro_step=1720/global_step=860, RunningAvgSamplesPerSec=46.421073151354676, CurrSamplesPerSec=46.339704948271745, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:36,046] [INFO] [logging.py:128:log_dist] [Rank 0] step=870, skipped=9, lr=[3.523339857049819e-05, 3.523339857049819e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:36,066] [INFO] [timer.py:264:stop] epoch=0/micro_step=1740/global_step=870, RunningAvgSamplesPerSec=46.420195916054574, CurrSamplesPerSec=46.33840905415208, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:42,936] [INFO] [logging.py:128:log_dist] [Rank 0] step=880, skipped=10, lr=[3.495869669843086e-05, 3.495869669843086e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:42,956] [INFO] [timer.py:264:stop] epoch=0/micro_step=1760/global_step=880, RunningAvgSamplesPerSec=46.42414880906899, CurrSamplesPerSec=46.31806821875464, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:49,887] [INFO] [logging.py:128:log_dist] [Rank 0] step=890, skipped=10, lr=[3.4651789094342044e-05, 3.4651789094342044e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:49,907] [INFO] [timer.py:264:stop] epoch=0/micro_step=1780/global_step=890, RunningAvgSamplesPerSec=46.42345409193999, CurrSamplesPerSec=46.35346829552468, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:56,836] [INFO] [logging.py:128:log_dist] [Rank 0] step=900, skipped=10, lr=[3.434316244145236e-05, 3.434316244145236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:56,857] [INFO] [timer.py:264:stop] epoch=0/micro_step=1800/global_step=900, RunningAvgSamplesPerSec=46.42303656570756, CurrSamplesPerSec=46.4751966952545, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:03,784] [INFO] [logging.py:128:log_dist] [Rank 0] step=910, skipped=10, lr=[3.403287170825234e-05, 3.403287170825234e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:03,805] [INFO] [timer.py:264:stop] epoch=0/micro_step=1820/global_step=910, RunningAvgSamplesPerSec=46.42259453897555, CurrSamplesPerSec=46.36161812871359, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:10,733] [INFO] [logging.py:128:log_dist] [Rank 0] step=920, skipped=10, lr=[3.3720972159616496e-05, 3.3720972159616496e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:10,751] [INFO] [timer.py:264:stop] epoch=0/micro_step=1840/global_step=920, RunningAvgSamplesPerSec=46.42241240368338, CurrSamplesPerSec=46.487559281055454, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:17,702] [INFO] [logging.py:128:log_dist] [Rank 0] step=930, skipped=10, lr=[3.340751934696017e-05, 3.340751934696017e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:17,722] [INFO] [timer.py:264:stop] epoch=0/micro_step=1860/global_step=930, RunningAvgSamplesPerSec=46.420715482541, CurrSamplesPerSec=46.42123576943527, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:24,619] [INFO] [logging.py:128:log_dist] [Rank 0] step=940, skipped=10, lr=[3.309256909834556e-05, 3.309256909834556e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:24,639] [INFO] [timer.py:264:stop] epoch=0/micro_step=1880/global_step=940, RunningAvgSamplesPerSec=46.42255350808398, CurrSamplesPerSec=46.7765126862402, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:31,518] [INFO] [logging.py:128:log_dist] [Rank 0] step=950, skipped=10, lr=[3.2776177508538304e-05, 3.2776177508538304e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:31,538] [INFO] [timer.py:264:stop] epoch=0/micro_step=1900/global_step=950, RunningAvgSamplesPerSec=46.42571670265899, CurrSamplesPerSec=46.77811035486935, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:38,418] [INFO] [logging.py:128:log_dist] [Rank 0] step=960, skipped=10, lr=[3.245840092901662e-05, 3.245840092901662e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:38,439] [INFO] [timer.py:264:stop] epoch=0/micro_step=1920/global_step=960, RunningAvgSamplesPerSec=46.428712246785175, CurrSamplesPerSec=46.81064173887472, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:45,316] [INFO] [logging.py:128:log_dist] [Rank 0] step=970, skipped=10, lr=[3.213929595793479e-05, 3.213929595793479e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:45,336] [INFO] [timer.py:264:stop] epoch=0/micro_step=1940/global_step=970, RunningAvgSamplesPerSec=46.431706559142256, CurrSamplesPerSec=46.806527953026816, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:47,365] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,365] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,367] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.59, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:52,204] [INFO] [logging.py:128:log_dist] [Rank 0] step=980, skipped=10, lr=[3.1818919430042524e-05, 3.1818919430042524e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:52,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=1960/global_step=980, RunningAvgSamplesPerSec=46.43535870004616, CurrSamplesPerSec=46.78776389902927, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:59,091] [INFO] [logging.py:128:log_dist] [Rank 0] step=990, skipped=10, lr=[3.1497328406562476e-05, 3.1497328406562476e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:59,112] [INFO] [timer.py:264:stop] epoch=0/micro_step=1980/global_step=990, RunningAvgSamplesPerSec=46.43904306339881, CurrSamplesPerSec=46.83297000125497, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.94, Samples/sec: 13.81, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:05,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=1000, skipped=10, lr=[3.117458016502711e-05, 3.117458016502711e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:06,006] [INFO] [timer.py:264:stop] epoch=0/micro_step=2000/global_step=1000, RunningAvgSamplesPerSec=46.44231158064333, CurrSamplesPerSec=46.81330302519387, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.53, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:12,898] [INFO] [logging.py:128:log_dist] [Rank 0] step=1010, skipped=10, lr=[3.0850732189077236e-05, 3.0850732189077236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:12,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=2020/global_step=1010, RunningAvgSamplesPerSec=46.444083453859605, CurrSamplesPerSec=46.8065442761449, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:19,784] [INFO] [logging.py:128:log_dist] [Rank 0] step=1020, skipped=10, lr=[3.05258421582238e-05, 3.05258421582238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:19,804] [INFO] [timer.py:264:stop] epoch=0/micro_step=2040/global_step=1020, RunningAvgSamplesPerSec=46.447488410641995, CurrSamplesPerSec=46.75251188976779, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:26,682] [INFO] [logging.py:128:log_dist] [Rank 0] step=1030, skipped=10, lr=[3.0199967937574774e-05, 3.0199967937574774e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:26,699] [INFO] [timer.py:264:stop] epoch=0/micro_step=2060/global_step=1030, RunningAvgSamplesPerSec=46.45030464457745, CurrSamplesPerSec=46.19032410349784, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:27,333] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,333] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,335] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,335] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,335] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.10, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:33,531] [INFO] [logging.py:128:log_dist] [Rank 0] step=1040, skipped=11, lr=[2.9905887623649602e-05, 2.9905887623649602e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:33,551] [INFO] [timer.py:264:stop] epoch=0/micro_step=2080/global_step=1040, RunningAvgSamplesPerSec=46.45609527814709, CurrSamplesPerSec=46.61336089809432, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:40,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=1050, skipped=11, lr=[2.9578303480235774e-05, 2.9578303480235774e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:40,441] [INFO] [timer.py:264:stop] epoch=0/micro_step=2100/global_step=1050, RunningAvgSamplesPerSec=46.45922477354279, CurrSamplesPerSec=46.80225168829745, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:47,310] [INFO] [logging.py:128:log_dist] [Rank 0] step=1060, skipped=11, lr=[2.9249903910062116e-05, 2.9249903910062116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:47,330] [INFO] [timer.py:264:stop] epoch=0/micro_step=2120/global_step=1060, RunningAvgSamplesPerSec=46.46228296302194, CurrSamplesPerSec=46.73650876277751, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.54, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:54,242] [INFO] [logging.py:128:log_dist] [Rank 0] step=1070, skipped=11, lr=[2.8920747403309247e-05, 2.8920747403309247e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:54,262] [INFO] [timer.py:264:stop] epoch=0/micro_step=2140/global_step=1070, RunningAvgSamplesPerSec=46.46284313048135, CurrSamplesPerSec=46.67312502129054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:01,069] [INFO] [logging.py:128:log_dist] [Rank 0] step=1080, skipped=12, lr=[2.8623907817398308e-05, 2.8623907817398308e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:01,090] [INFO] [timer.py:264:stop] epoch=0/micro_step=2160/global_step=1080, RunningAvgSamplesPerSec=46.469588801539444, CurrSamplesPerSec=46.82241573242371, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:07,965] [INFO] [logging.py:128:log_dist] [Rank 0] step=1090, skipped=12, lr=[2.8293474746020472e-05, 2.8293474746020472e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:07,984] [INFO] [timer.py:264:stop] epoch=0/micro_step=2180/global_step=1090, RunningAvgSamplesPerSec=46.47228214300523, CurrSamplesPerSec=46.75736546050338, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:14,870] [INFO] [logging.py:128:log_dist] [Rank 0] step=1100, skipped=12, lr=[2.7962455084554778e-05, 2.7962455084554778e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:14,890] [INFO] [timer.py:264:stop] epoch=0/micro_step=2200/global_step=1100, RunningAvgSamplesPerSec=46.47416601952744, CurrSamplesPerSec=46.769764555584786, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:21,778] [INFO] [logging.py:128:log_dist] [Rank 0] step=1110, skipped=12, lr=[2.763090778983777e-05, 2.763090778983777e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:21,798] [INFO] [timer.py:264:stop] epoch=0/micro_step=2220/global_step=1110, RunningAvgSamplesPerSec=46.47592055429421, CurrSamplesPerSec=46.91998677298789, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:28,694] [INFO] [logging.py:128:log_dist] [Rank 0] step=1120, skipped=12, lr=[2.729889191268107e-05, 2.729889191268107e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:28,714] [INFO] [timer.py:264:stop] epoch=0/micro_step=2240/global_step=1120, RunningAvgSamplesPerSec=46.477148202453265, CurrSamplesPerSec=46.72287486730245, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:35,589] [INFO] [logging.py:128:log_dist] [Rank 0] step=1130, skipped=12, lr=[2.696646658735396e-05, 2.696646658735396e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:35,609] [INFO] [timer.py:264:stop] epoch=0/micro_step=2260/global_step=1130, RunningAvgSamplesPerSec=46.479514044879025, CurrSamplesPerSec=46.664411030997236, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:42,488] [INFO] [logging.py:128:log_dist] [Rank 0] step=1140, skipped=12, lr=[2.6633691021051226e-05, 2.6633691021051226e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:42,509] [INFO] [timer.py:264:stop] epoch=0/micro_step=2280/global_step=1140, RunningAvgSamplesPerSec=46.481591664999485, CurrSamplesPerSec=46.77589321144054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:49,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=1150, skipped=12, lr=[2.6300624483347926e-05, 2.6300624483347926e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:49,440] [INFO] [timer.py:264:stop] epoch=0/micro_step=2300/global_step=1150, RunningAvgSamplesPerSec=46.481755312094066, CurrSamplesPerSec=46.77967552383947, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:56,307] [INFO] [logging.py:128:log_dist] [Rank 0] step=1160, skipped=12, lr=[2.596732629564309e-05, 2.596732629564309e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:56,328] [INFO] [timer.py:264:stop] epoch=0/micro_step=2320/global_step=1160, RunningAvgSamplesPerSec=46.48439949209591, CurrSamplesPerSec=46.79316314116429, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:03,200] [INFO] [logging.py:128:log_dist] [Rank 0] step=1170, skipped=12, lr=[2.56338558205942e-05, 2.56338558205942e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:03,221] [INFO] [timer.py:264:stop] epoch=0/micro_step=2340/global_step=1170, RunningAvgSamplesPerSec=46.48707192862625, CurrSamplesPerSec=46.97179330281597, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:04,561] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,561] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,563] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,563] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,564] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,564] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:10,094] [INFO] [logging.py:128:log_dist] [Rank 0] step=1180, skipped=12, lr=[2.5300272451544234e-05, 2.5300272451544234e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:10,114] [INFO] [timer.py:264:stop] epoch=0/micro_step=2360/global_step=1180, RunningAvgSamplesPerSec=46.48953479190863, CurrSamplesPerSec=46.803459409078194, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:16,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=1190, skipped=12, lr=[2.496663560194338e-05, 2.496663560194338e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:17,006] [INFO] [timer.py:264:stop] epoch=0/micro_step=2380/global_step=1190, RunningAvgSamplesPerSec=46.49180947433954, CurrSamplesPerSec=46.80528742936384, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.53, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:19,727] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:23,847] [INFO] [logging.py:128:log_dist] [Rank 0] step=1200, skipped=13, lr=[2.4666365824494565e-05, 2.4666365824494565e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:23,868] [INFO] [timer.py:264:stop] epoch=0/micro_step=2400/global_step=1200, RunningAvgSamplesPerSec=46.49591524184215, CurrSamplesPerSec=46.686599976696954, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:30,757] [INFO] [logging.py:128:log_dist] [Rank 0] step=1210, skipped=13, lr=[2.4332791071488294e-05, 2.4332791071488294e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:30,777] [INFO] [timer.py:264:stop] epoch=0/micro_step=2420/global_step=1210, RunningAvgSamplesPerSec=46.49732960112968, CurrSamplesPerSec=46.76973196064543, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:37,682] [INFO] [logging.py:128:log_dist] [Rank 0] step=1220, skipped=13, lr=[2.3999335152896784e-05, 2.3999335152896784e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:37,703] [INFO] [timer.py:264:stop] epoch=0/micro_step=2440/global_step=1220, RunningAvgSamplesPerSec=46.497692834200365, CurrSamplesPerSec=46.50880673295993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.49, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.49, Samples/sec: 3.50, Time/seq 0.29s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:44,633] [INFO] [logging.py:128:log_dist] [Rank 0] step=1230, skipped=13, lr=[2.3666057459470436e-05, 2.3666057459470436e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:44,654] [INFO] [timer.py:264:stop] epoch=0/micro_step=2460/global_step=1230, RunningAvgSamplesPerSec=46.49675848239761, CurrSamplesPerSec=46.493952415474645, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:51,580] [INFO] [logging.py:128:log_dist] [Rank 0] step=1240, skipped=13, lr=[2.3333017350216558e-05, 2.3333017350216558e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:51,600] [INFO] [timer.py:264:stop] epoch=0/micro_step=2480/global_step=1240, RunningAvgSamplesPerSec=46.49602569155278, CurrSamplesPerSec=46.5161407241932, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:58,509] [INFO] [logging.py:128:log_dist] [Rank 0] step=1250, skipped=13, lr=[2.300027414182708e-05, 2.300027414182708e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:58,529] [INFO] [timer.py:264:stop] epoch=0/micro_step=2500/global_step=1250, RunningAvgSamplesPerSec=46.49621684629403, CurrSamplesPerSec=46.497528184225175, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:05,438] [INFO] [logging.py:128:log_dist] [Rank 0] step=1260, skipped=13, lr=[2.2667887098113915e-05, 2.2667887098113915e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:05,458] [INFO] [timer.py:264:stop] epoch=0/micro_step=2520/global_step=1260, RunningAvgSamplesPerSec=46.496336564272966, CurrSamplesPerSec=46.48627120484043, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:12,370] [INFO] [logging.py:128:log_dist] [Rank 0] step=1270, skipped=13, lr=[2.233591541945361e-05, 2.233591541945361e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:12,390] [INFO] [timer.py:264:stop] epoch=0/micro_step=2540/global_step=1270, RunningAvgSamplesPerSec=46.49631392383694, CurrSamplesPerSec=46.47926853952197, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.56, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:19,299] [INFO] [logging.py:128:log_dist] [Rank 0] step=1280, skipped=13, lr=[2.2004418232243425e-05, 2.2004418232243425e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:19,320] [INFO] [timer.py:264:stop] epoch=0/micro_step=2560/global_step=1280, RunningAvgSamplesPerSec=46.496336834420134, CurrSamplesPerSec=46.493743040656476, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:26,231] [INFO] [logging.py:128:log_dist] [Rank 0] step=1290, skipped=13, lr=[2.1673454578370484e-05, 2.1673454578370484e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:26,251] [INFO] [timer.py:264:stop] epoch=0/micro_step=2580/global_step=1290, RunningAvgSamplesPerSec=46.49628521737437, CurrSamplesPerSec=46.5082749070585, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,684] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,684] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:33,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=1300, skipped=13, lr=[2.1343083404695983e-05, 2.1343083404695983e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:33,205] [INFO] [timer.py:264:stop] epoch=0/micro_step=2600/global_step=1300, RunningAvgSamplesPerSec=46.49535814691071, CurrSamplesPerSec=46.25004359522548, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:40,113] [INFO] [logging.py:128:log_dist] [Rank 0] step=1310, skipped=13, lr=[2.101336355255645e-05, 2.101336355255645e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:40,134] [INFO] [timer.py:264:stop] epoch=0/micro_step=2620/global_step=1310, RunningAvgSamplesPerSec=46.495657776731484, CurrSamplesPerSec=46.5038918355933, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:47,050] [INFO] [logging.py:128:log_dist] [Rank 0] step=1320, skipped=13, lr=[2.0684353747283626e-05, 2.0684353747283626e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:47,070] [INFO] [timer.py:264:stop] epoch=0/micro_step=2640/global_step=1320, RunningAvgSamplesPerSec=46.49561196265634, CurrSamplesPerSec=46.49445170149724, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:53,981] [INFO] [logging.py:128:log_dist] [Rank 0] step=1330, skipped=13, lr=[2.035611258774508e-05, 2.035611258774508e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:54,002] [INFO] [timer.py:264:stop] epoch=0/micro_step=2660/global_step=1330, RunningAvgSamplesPerSec=46.495620556823845, CurrSamplesPerSec=46.64968416069277, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:00,913] [INFO] [logging.py:128:log_dist] [Rank 0] step=1340, skipped=13, lr=[2.0028698535907454e-05, 2.0028698535907454e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:00,934] [INFO] [timer.py:264:stop] epoch=0/micro_step=2680/global_step=1340, RunningAvgSamplesPerSec=46.49557111930108, CurrSamplesPerSec=46.49372693497934, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:07,851] [INFO] [logging.py:128:log_dist] [Rank 0] step=1350, skipped=13, lr=[1.970216990642385e-05, 1.970216990642385e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:07,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=2700/global_step=1350, RunningAvgSamplesPerSec=46.4952625195588, CurrSamplesPerSec=46.51009605835737, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:14,772] [INFO] [logging.py:128:log_dist] [Rank 0] step=1360, skipped=13, lr=[1.9376584856247734e-05, 1.9376584856247734e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:14,793] [INFO] [timer.py:264:stop] epoch=0/micro_step=2720/global_step=1360, RunningAvgSamplesPerSec=46.49589953343763, CurrSamplesPerSec=46.493340402075496, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:21,676] [INFO] [logging.py:128:log_dist] [Rank 0] step=1370, skipped=13, lr=[1.9052001374274694e-05, 1.9052001374274694e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:21,696] [INFO] [timer.py:264:stop] epoch=0/micro_step=2740/global_step=1370, RunningAvgSamplesPerSec=46.497266423476795, CurrSamplesPerSec=46.77222560471587, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:28,578] [INFO] [logging.py:128:log_dist] [Rank 0] step=1380, skipped=13, lr=[1.8728477271014252e-05, 1.8728477271014252e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:28,598] [INFO] [timer.py:264:stop] epoch=0/micro_step=2760/global_step=1380, RunningAvgSamplesPerSec=46.498741093520614, CurrSamplesPerSec=46.613199012037946, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:35,473] [INFO] [logging.py:128:log_dist] [Rank 0] step=1390, skipped=13, lr=[1.8406070168293457e-05, 1.8406070168293457e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:35,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=2780/global_step=1390, RunningAvgSamplesPerSec=46.500554058102814, CurrSamplesPerSec=46.81554004591409, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:38,900] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:42,364] [INFO] [logging.py:128:log_dist] [Rank 0] step=1400, skipped=13, lr=[1.8084837488994006e-05, 1.8084837488994006e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:42,384] [INFO] [timer.py:264:stop] epoch=0/micro_step=2800/global_step=1400, RunningAvgSamplesPerSec=46.50243920837039, CurrSamplesPerSec=46.78748663020288, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.49, Samples/sec: 3.49, Time/seq 0.29s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.93, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:49,284] [INFO] [logging.py:128:log_dist] [Rank 0] step=1410, skipped=13, lr=[1.7764836446824833e-05, 1.7764836446824833e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:49,304] [INFO] [timer.py:264:stop] epoch=0/micro_step=2820/global_step=1410, RunningAvgSamplesPerSec=46.50316609477529, CurrSamplesPerSec=46.82542141733676, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:56,173] [INFO] [logging.py:128:log_dist] [Rank 0] step=1420, skipped=13, lr=[1.7446124036132035e-05, 1.7446124036132035e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:56,194] [INFO] [timer.py:264:stop] epoch=0/micro_step=2840/global_step=1420, RunningAvgSamplesPerSec=46.50514626952885, CurrSamplesPerSec=46.80305138832927, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:03,071] [INFO] [logging.py:128:log_dist] [Rank 0] step=1430, skipped=13, lr=[1.71287570217477e-05, 1.71287570217477e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:03,091] [INFO] [timer.py:264:stop] epoch=0/micro_step=2860/global_step=1430, RunningAvgSamplesPerSec=46.50676320685188, CurrSamplesPerSec=46.848777623724516, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:09,908] [INFO] [logging.py:128:log_dist] [Rank 0] step=1440, skipped=14, lr=[1.684432374584351e-05, 1.684432374584351e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:09,929] [INFO] [timer.py:264:stop] epoch=0/micro_step=2880/global_step=1440, RunningAvgSamplesPerSec=46.51114454430859, CurrSamplesPerSec=46.99939345444054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:16,798] [INFO] [logging.py:128:log_dist] [Rank 0] step=1450, skipped=14, lr=[1.6529668505230238e-05, 1.6529668505230238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:16,818] [INFO] [timer.py:264:stop] epoch=0/micro_step=2900/global_step=1450, RunningAvgSamplesPerSec=46.51307636486264, CurrSamplesPerSec=46.83207123269369, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:23,735] [INFO] [logging.py:128:log_dist] [Rank 0] step=1460, skipped=14, lr=[1.6216521887842863e-05, 1.6216521887842863e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:23,756] [INFO] [timer.py:264:stop] epoch=0/micro_step=2920/global_step=1460, RunningAvgSamplesPerSec=46.51272734068831, CurrSamplesPerSec=46.46213298539395, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:30,666] [INFO] [logging.py:128:log_dist] [Rank 0] step=1470, skipped=14, lr=[1.59049396672081e-05, 1.59049396672081e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:30,687] [INFO] [timer.py:264:stop] epoch=0/micro_step=2940/global_step=1470, RunningAvgSamplesPerSec=46.51279988022624, CurrSamplesPerSec=46.513899984808106, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:37,603] [INFO] [logging.py:128:log_dist] [Rank 0] step=1480, skipped=14, lr=[1.5594977338223077e-05, 1.5594977338223077e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:37,622] [INFO] [timer.py:264:stop] epoch=0/micro_step=2960/global_step=1480, RunningAvgSamplesPerSec=46.512692069553914, CurrSamplesPerSec=46.78629604259554, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:44,541] [INFO] [logging.py:128:log_dist] [Rank 0] step=1490, skipped=14, lr=[1.528669010727125e-05, 1.528669010727125e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:44,562] [INFO] [timer.py:264:stop] epoch=0/micro_step=2980/global_step=1490, RunningAvgSamplesPerSec=46.51222927903836, CurrSamplesPerSec=46.50253840695117, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:51,473] [INFO] [logging.py:128:log_dist] [Rank 0] step=1500, skipped=14, lr=[1.4980132882389835e-05, 1.4980132882389835e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:51,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=3000/global_step=1500, RunningAvgSamplesPerSec=46.51210125057641, CurrSamplesPerSec=46.50220006209979, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:58,403] [INFO] [logging.py:128:log_dist] [Rank 0] step=1510, skipped=14, lr=[1.4675360263490295e-05, 1.4675360263490295e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:58,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=3020/global_step=1510, RunningAvgSamplesPerSec=46.51201324992706, CurrSamplesPerSec=46.546904191841676, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:05,351] [INFO] [logging.py:128:log_dist] [Rank 0] step=1520, skipped=14, lr=[1.4372426532633664e-05, 1.4372426532633664e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:05,372] [INFO] [timer.py:264:stop] epoch=0/micro_step=3040/global_step=1520, RunningAvgSamplesPerSec=46.51129958829241, CurrSamplesPerSec=46.45904510287717, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:12,297] [INFO] [logging.py:128:log_dist] [Rank 0] step=1530, skipped=14, lr=[1.4071385644362672e-05, 1.4071385644362672e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:12,318] [INFO] [timer.py:264:stop] epoch=0/micro_step=3060/global_step=1530, RunningAvgSamplesPerSec=46.51070949284205, CurrSamplesPerSec=46.50381127215948, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:17,818] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,824] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,824] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:19,220] [INFO] [logging.py:128:log_dist] [Rank 0] step=1540, skipped=14, lr=[1.3772291216091954e-05, 1.3772291216091954e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:19,241] [INFO] [timer.py:264:stop] epoch=0/micro_step=3080/global_step=1540, RunningAvgSamplesPerSec=46.5110599747054, CurrSamplesPerSec=46.807442065172594, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:26,152] [INFO] [logging.py:128:log_dist] [Rank 0] step=1550, skipped=14, lr=[1.347519651855848e-05, 1.347519651855848e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:26,173] [INFO] [timer.py:264:stop] epoch=0/micro_step=3100/global_step=1550, RunningAvgSamplesPerSec=46.51086523978462, CurrSamplesPerSec=46.47960655075998, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:33,087] [INFO] [logging.py:128:log_dist] [Rank 0] step=1560, skipped=14, lr=[1.3180154466333705e-05, 1.3180154466333705e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:33,108] [INFO] [timer.py:264:stop] epoch=0/micro_step=3120/global_step=1560, RunningAvgSamplesPerSec=46.51062351613674, CurrSamplesPerSec=46.46309803286058, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:40,050] [INFO] [logging.py:128:log_dist] [Rank 0] step=1570, skipped=14, lr=[1.2887217608399083e-05, 1.2887217608399083e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:40,071] [INFO] [timer.py:264:stop] epoch=0/micro_step=3140/global_step=1570, RunningAvgSamplesPerSec=46.50958519849321, CurrSamplesPerSec=46.503472908785604, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:46,982] [INFO] [logging.py:128:log_dist] [Rank 0] step=1580, skipped=14, lr=[1.2596438118786732e-05, 1.2596438118786732e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:47,003] [INFO] [timer.py:264:stop] epoch=0/micro_step=3160/global_step=1580, RunningAvgSamplesPerSec=46.50945444674376, CurrSamplesPerSec=46.49416179217857, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.52, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.95, Samples/sec: 13.88, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:53,931] [INFO] [logging.py:128:log_dist] [Rank 0] step=1590, skipped=14, lr=[1.2307867787286942e-05, 1.2307867787286942e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:53,951] [INFO] [timer.py:264:stop] epoch=0/micro_step=3180/global_step=1590, RunningAvgSamplesPerSec=46.50879849785462, CurrSamplesPerSec=46.54303030602095, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:00,889] [INFO] [logging.py:128:log_dist] [Rank 0] step=1600, skipped=14, lr=[1.2021558010224001e-05, 1.2021558010224001e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:00,908] [INFO] [timer.py:264:stop] epoch=0/micro_step=3200/global_step=1600, RunningAvgSamplesPerSec=46.50777483497731, CurrSamplesPerSec=46.43972299579574, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.53, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:07,814] [INFO] [logging.py:128:log_dist] [Rank 0] step=1610, skipped=14, lr=[1.1737559781302185e-05, 1.1737559781302185e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:07,835] [INFO] [timer.py:264:stop] epoch=0/micro_step=3220/global_step=1610, RunningAvgSamplesPerSec=46.50800788060955, CurrSamplesPerSec=46.49611069658042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:14,759] [INFO] [logging.py:128:log_dist] [Rank 0] step=1620, skipped=14, lr=[1.1455923682523475e-05, 1.1455923682523475e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:14,778] [INFO] [timer.py:264:stop] epoch=0/micro_step=3240/global_step=1620, RunningAvgSamplesPerSec=46.507528936833005, CurrSamplesPerSec=45.83951018076546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:21,714] [INFO] [logging.py:128:log_dist] [Rank 0] step=1630, skipped=14, lr=[1.1176699875178485e-05, 1.1176699875178485e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:21,735] [INFO] [timer.py:264:stop] epoch=0/micro_step=3260/global_step=1630, RunningAvgSamplesPerSec=46.5067150409607, CurrSamplesPerSec=46.487349963813216, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,250] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,251] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:28,650] [INFO] [logging.py:128:log_dist] [Rank 0] step=1640, skipped=14, lr=[1.0899938090912464e-05, 1.0899938090912464e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:28,671] [INFO] [timer.py:264:stop] epoch=0/micro_step=3280/global_step=1640, RunningAvgSamplesPerSec=46.50646437840029, CurrSamplesPerSec=46.486061899197516, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:35,580] [INFO] [logging.py:128:log_dist] [Rank 0] step=1650, skipped=14, lr=[1.0625687622867731e-05, 1.0625687622867731e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:35,601] [INFO] [timer.py:264:stop] epoch=0/micro_step=3300/global_step=1650, RunningAvgSamplesPerSec=46.506480546092504, CurrSamplesPerSec=46.49532145114602, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:40,393] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,395] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.08, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:42,454] [INFO] [logging.py:128:log_dist] [Rank 0] step=1660, skipped=15, lr=[1.038104975748232e-05, 1.038104975748232e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:42,473] [INFO] [timer.py:264:stop] epoch=0/micro_step=3320/global_step=1660, RunningAvgSamplesPerSec=46.5088144037295, CurrSamplesPerSec=46.2695112142726, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:49,394] [INFO] [logging.py:128:log_dist] [Rank 0] step=1670, skipped=15, lr=[1.011170498391135e-05, 1.011170498391135e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:49,413] [INFO] [timer.py:264:stop] epoch=0/micro_step=3340/global_step=1670, RunningAvgSamplesPerSec=46.508717776579374, CurrSamplesPerSec=46.62398308028986, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:56,325] [INFO] [logging.py:128:log_dist] [Rank 0] step=1680, skipped=15, lr=[9.845011916199696e-06, 9.845011916199696e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:56,346] [INFO] [timer.py:264:stop] epoch=0/micro_step=3360/global_step=1680, RunningAvgSamplesPerSec=46.50875869239154, CurrSamplesPerSec=46.45526622191193, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:03,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=1690, skipped=15, lr=[9.581018054183269e-06, 9.581018054183269e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:03,276] [INFO] [timer.py:264:stop] epoch=0/micro_step=3380/global_step=1690, RunningAvgSamplesPerSec=46.50870012606984, CurrSamplesPerSec=46.556898566121795, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:10,185] [INFO] [logging.py:128:log_dist] [Rank 0] step=1700, skipped=15, lr=[9.31977041695123e-06, 9.31977041695123e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:10,206] [INFO] [timer.py:264:stop] epoch=0/micro_step=3400/global_step=1700, RunningAvgSamplesPerSec=46.508622175904215, CurrSamplesPerSec=46.45679378061621, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:17,116] [INFO] [logging.py:128:log_dist] [Rank 0] step=1710, skipped=15, lr=[9.061315534471568e-06, 9.061315534471568e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:17,137] [INFO] [timer.py:264:stop] epoch=0/micro_step=3420/global_step=1710, RunningAvgSamplesPerSec=46.50870541176424, CurrSamplesPerSec=46.7551991450285, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.15s, TFLOPs: 1.86, Samples/sec: 13.23, Time/seq 0.08s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:24,056] [INFO] [logging.py:128:log_dist] [Rank 0] step=1720, skipped=15, lr=[8.805699439303772e-06, 8.805699439303772e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:24,077] [INFO] [timer.py:264:stop] epoch=0/micro_step=3440/global_step=1720, RunningAvgSamplesPerSec=46.50831570128103, CurrSamplesPerSec=46.47883396086827, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:30,991] [INFO] [logging.py:128:log_dist] [Rank 0] step=1730, skipped=15, lr=[8.552967658400174e-06, 8.552967658400174e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:31,012] [INFO] [timer.py:264:stop] epoch=0/micro_step=3460/global_step=1730, RunningAvgSamplesPerSec=46.50817874207723, CurrSamplesPerSec=46.92904261360235, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:37,918] [INFO] [logging.py:128:log_dist] [Rank 0] step=1740, skipped=15, lr=[8.303165204997231e-06, 8.303165204997231e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:37,939] [INFO] [timer.py:264:stop] epoch=0/micro_step=3480/global_step=1740, RunningAvgSamplesPerSec=46.5082478561663, CurrSamplesPerSec=46.53083177923073, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.55, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:44,850] [INFO] [logging.py:128:log_dist] [Rank 0] step=1750, skipped=15, lr=[8.056336570598434e-06, 8.056336570598434e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:44,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=3500/global_step=1750, RunningAvgSamplesPerSec=46.50811673391772, CurrSamplesPerSec=46.45566820130503, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,380] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,380] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:51,779] [INFO] [logging.py:128:log_dist] [Rank 0] step=1760, skipped=15, lr=[7.812525717049999e-06, 7.812525717049999e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:51,799] [INFO] [timer.py:264:stop] epoch=0/micro_step=3520/global_step=1760, RunningAvgSamplesPerSec=46.50828314448669, CurrSamplesPerSec=46.52642832694709, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:58,711] [INFO] [logging.py:128:log_dist] [Rank 0] step=1770, skipped=15, lr=[7.571776068710998e-06, 7.571776068710998e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:58,731] [INFO] [timer.py:264:stop] epoch=0/micro_step=3540/global_step=1770, RunningAvgSamplesPerSec=46.50829869356961, CurrSamplesPerSec=46.50495529908099, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:05,651] [INFO] [logging.py:128:log_dist] [Rank 0] step=1780, skipped=15, lr=[7.334130504719211e-06, 7.334130504719211e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:05,671] [INFO] [timer.py:264:stop] epoch=0/micro_step=3560/global_step=1780, RunningAvgSamplesPerSec=46.507906608593274, CurrSamplesPerSec=46.49524091740414, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:12,577] [INFO] [logging.py:128:log_dist] [Rank 0] step=1790, skipped=15, lr=[7.099631351354036e-06, 7.099631351354036e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:12,597] [INFO] [timer.py:264:stop] epoch=0/micro_step=3580/global_step=1790, RunningAvgSamplesPerSec=46.50816179436258, CurrSamplesPerSec=46.53826953800675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:19,505] [INFO] [logging.py:128:log_dist] [Rank 0] step=1800, skipped=15, lr=[6.868320374497869e-06, 6.868320374497869e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:19,526] [INFO] [timer.py:264:stop] epoch=0/micro_step=3600/global_step=1800, RunningAvgSamplesPerSec=46.50819381179044, CurrSamplesPerSec=46.46337147026462, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:26,432] [INFO] [logging.py:128:log_dist] [Rank 0] step=1810, skipped=15, lr=[6.64023877219738e-06, 6.64023877219738e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:26,452] [INFO] [timer.py:264:stop] epoch=0/micro_step=3620/global_step=1810, RunningAvgSamplesPerSec=46.5083076047805, CurrSamplesPerSec=46.491520562684975, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.95, Samples/sec: 13.91, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.92, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:33,369] [INFO] [logging.py:128:log_dist] [Rank 0] step=1820, skipped=15, lr=[6.415427167325794e-06, 6.415427167325794e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:33,389] [INFO] [timer.py:264:stop] epoch=0/micro_step=3640/global_step=1820, RunningAvgSamplesPerSec=46.50816781684397, CurrSamplesPerSec=46.81537675249691, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:40,298] [INFO] [logging.py:128:log_dist] [Rank 0] step=1830, skipped=15, lr=[6.19392560034775e-06, 6.19392560034775e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:40,318] [INFO] [timer.py:264:stop] epoch=0/micro_step=3660/global_step=1830, RunningAvgSamplesPerSec=46.50820709570212, CurrSamplesPerSec=46.5024095130939, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:47,228] [INFO] [logging.py:128:log_dist] [Rank 0] step=1840, skipped=15, lr=[5.975773522187763e-06, 5.975773522187763e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:47,248] [INFO] [timer.py:264:stop] epoch=0/micro_step=3680/global_step=1840, RunningAvgSamplesPerSec=46.508121168997306, CurrSamplesPerSec=46.481586429598636, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:54,034] [INFO] [logging.py:128:log_dist] [Rank 0] step=1850, skipped=17, lr=[5.803689621959219e-06, 5.803689621959219e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:54,054] [INFO] [timer.py:264:stop] epoch=0/micro_step=3700/global_step=1850, RunningAvgSamplesPerSec=46.51260312244167, CurrSamplesPerSec=46.48744657153694, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:00,972] [INFO] [logging.py:128:log_dist] [Rank 0] step=1860, skipped=17, lr=[5.5916641313404325e-06, 5.5916641313404325e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:00,991] [INFO] [timer.py:264:stop] epoch=0/micro_step=3720/global_step=1860, RunningAvgSamplesPerSec=46.51238471774673, CurrSamplesPerSec=46.237791067849386, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:07,897] [INFO] [logging.py:128:log_dist] [Rank 0] step=1870, skipped=17, lr=[5.383095396373447e-06, 5.383095396373447e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:07,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=3740/global_step=1870, RunningAvgSamplesPerSec=46.51254589556029, CurrSamplesPerSec=46.49611069658042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:14,834] [INFO] [logging.py:128:log_dist] [Rank 0] step=1880, skipped=17, lr=[5.178020564558106e-06, 5.178020564558106e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:14,855] [INFO] [timer.py:264:stop] epoch=0/micro_step=3760/global_step=1880, RunningAvgSamplesPerSec=46.51221216844289, CurrSamplesPerSec=46.48469340875124, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:21,777] [INFO] [logging.py:128:log_dist] [Rank 0] step=1890, skipped=17, lr=[4.976476161106478e-06, 4.976476161106478e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:21,798] [INFO] [timer.py:264:stop] epoch=0/micro_step=3780/global_step=1890, RunningAvgSamplesPerSec=46.512024339691905, CurrSamplesPerSec=46.72973962932564, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:28,720] [INFO] [logging.py:128:log_dist] [Rank 0] step=1900, skipped=17, lr=[4.778498082437544e-06, 4.778498082437544e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:28,738] [INFO] [timer.py:264:stop] epoch=0/micro_step=3800/global_step=1900, RunningAvgSamplesPerSec=46.511614340757966, CurrSamplesPerSec=45.800076335051905, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:35,653] [INFO] [logging.py:128:log_dist] [Rank 0] step=1910, skipped=17, lr=[4.584121589783738e-06, 4.584121589783738e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:35,674] [INFO] [timer.py:264:stop] epoch=0/micro_step=3820/global_step=1910, RunningAvgSamplesPerSec=46.51156047823584, CurrSamplesPerSec=46.491182378142085, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:42,583] [INFO] [logging.py:128:log_dist] [Rank 0] step=1920, skipped=17, lr=[4.39338130291071e-06, 4.39338130291071e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:42,604] [INFO] [timer.py:264:stop] epoch=0/micro_step=3840/global_step=1920, RunningAvgSamplesPerSec=46.51154169085933, CurrSamplesPerSec=46.508935662282894, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:49,550] [INFO] [logging.py:128:log_dist] [Rank 0] step=1930, skipped=17, lr=[4.206311193951332e-06, 4.206311193951332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:49,571] [INFO] [timer.py:264:stop] epoch=0/micro_step=3860/global_step=1930, RunningAvgSamplesPerSec=46.51063862305707, CurrSamplesPerSec=46.89421645226965, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:56,483] [INFO] [logging.py:128:log_dist] [Rank 0] step=1940, skipped=17, lr=[4.022944581354981e-06, 4.022944581354981e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:56,503] [INFO] [timer.py:264:stop] epoch=0/micro_step=3880/global_step=1940, RunningAvgSamplesPerSec=46.51063920775755, CurrSamplesPerSec=46.44960709050509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,935] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,935] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:03,413] [INFO] [logging.py:128:log_dist] [Rank 0] step=1950, skipped=17, lr=[3.843314123953354e-06, 3.843314123953354e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:03,434] [INFO] [timer.py:264:stop] epoch=0/micro_step=3900/global_step=1950, RunningAvgSamplesPerSec=46.51056111915757, CurrSamplesPerSec=46.50838771759699, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:10,344] [INFO] [logging.py:128:log_dist] [Rank 0] step=1960, skipped=17, lr=[3.6674518151436744e-06, 3.6674518151436744e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:10,364] [INFO] [timer.py:264:stop] epoch=0/micro_step=3920/global_step=1960, RunningAvgSamplesPerSec=46.51048027244313, CurrSamplesPerSec=46.48772029560139, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:17,273] [INFO] [logging.py:128:log_dist] [Rank 0] step=1970, skipped=17, lr=[3.4953889771904475e-06, 3.4953889771904475e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:17,293] [INFO] [timer.py:264:stop] epoch=0/micro_step=3940/global_step=1970, RunningAvgSamplesPerSec=46.51043553683364, CurrSamplesPerSec=46.44023718641015, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:24,143] [INFO] [logging.py:128:log_dist] [Rank 0] step=1980, skipped=18, lr=[3.343806313235337e-06, 3.343806313235337e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:24,163] [INFO] [timer.py:264:stop] epoch=0/micro_step=3960/global_step=1980, RunningAvgSamplesPerSec=46.51242203854685, CurrSamplesPerSec=46.519187831305196, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:31,073] [INFO] [logging.py:128:log_dist] [Rank 0] step=1990, skipped=18, lr=[3.1790463347058543e-06, 3.1790463347058543e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:31,094] [INFO] [timer.py:264:stop] epoch=0/micro_step=3980/global_step=1990, RunningAvgSamplesPerSec=46.512401921343596, CurrSamplesPerSec=46.516979041899226, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:38,016] [INFO] [logging.py:128:log_dist] [Rank 0] step=2000, skipped=18, lr=[3.0181728153463233e-06, 3.0181728153463233e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:38,035] [INFO] [timer.py:264:stop] epoch=0/micro_step=4000/global_step=2000, RunningAvgSamplesPerSec=46.51207795653655, CurrSamplesPerSec=45.77321075988233, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:44,970] [INFO] [logging.py:128:log_dist] [Rank 0] step=2010, skipped=18, lr=[2.8612144078166593e-06, 2.8612144078166593e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:44,990] [INFO] [timer.py:264:stop] epoch=0/micro_step=4020/global_step=2010, RunningAvgSamplesPerSec=46.511671133770086, CurrSamplesPerSec=46.50012176602313, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:51,899] [INFO] [logging.py:128:log_dist] [Rank 0] step=2020, skipped=18, lr=[2.708199067468939e-06, 2.708199067468939e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:51,919] [INFO] [timer.py:264:stop] epoch=0/micro_step=4040/global_step=2020, RunningAvgSamplesPerSec=46.5116397897557, CurrSamplesPerSec=46.49754429253596, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:58,848] [INFO] [logging.py:128:log_dist] [Rank 0] step=2030, skipped=18, lr=[2.5591540473683453e-06, 2.5591540473683453e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:58,868] [INFO] [timer.py:264:stop] epoch=0/micro_step=4060/global_step=2030, RunningAvgSamplesPerSec=46.511126243282945, CurrSamplesPerSec=46.50880673295993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:05,780] [INFO] [logging.py:128:log_dist] [Rank 0] step=2040, skipped=18, lr=[2.414105893439225e-06, 2.414105893439225e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:05,801] [INFO] [timer.py:264:stop] epoch=0/micro_step=4080/global_step=2040, RunningAvgSamplesPerSec=46.5110025907912, CurrSamplesPerSec=46.43461384529517, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:12,744] [INFO] [logging.py:128:log_dist] [Rank 0] step=2050, skipped=18, lr=[2.2730804397370688e-06, 2.2730804397370688e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:12,764] [INFO] [timer.py:264:stop] epoch=0/micro_step=4100/global_step=2050, RunningAvgSamplesPerSec=46.51005633195471, CurrSamplesPerSec=46.5211065850435, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:19,691] [INFO] [logging.py:128:log_dist] [Rank 0] step=2060, skipped=18, lr=[2.1361028038473034e-06, 2.1361028038473034e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:19,707] [INFO] [timer.py:264:stop] epoch=0/micro_step=4120/global_step=2060, RunningAvgSamplesPerSec=46.509589050477125, CurrSamplesPerSec=45.671672575913625, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:26,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=2070, skipped=18, lr=[2.003197382411673e-06, 2.003197382411673e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:26,631] [INFO] [timer.py:264:stop] epoch=0/micro_step=4140/global_step=2070, RunningAvgSamplesPerSec=46.509808656833094, CurrSamplesPerSec=46.522928741415484, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,062] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,063] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:33,543] [INFO] [logging.py:128:log_dist] [Rank 0] step=2080, skipped=18, lr=[1.8743878467830294e-06, 1.8743878467830294e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:33,564] [INFO] [timer.py:264:stop] epoch=0/micro_step=4160/global_step=2080, RunningAvgSamplesPerSec=46.50968619314036, CurrSamplesPerSec=46.45285449163494, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from [2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:40,414] [INFO] [logging.py:128:log_dist] [Rank 0] step=2090, skipped=19, lr=[1.7619802236591875e-06, 1.7619802236591875e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:40,434] [INFO] [timer.py:264:stop] epoch=0/micro_step=4180/global_step=2090, RunningAvgSamplesPerSec=46.51158895789503, CurrSamplesPerSec=46.4917138132042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:47,345] [INFO] [logging.py:128:log_dist] [Rank 0] step=2100, skipped=19, lr=[1.6410154696242603e-06, 1.6410154696242603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:47,365] [INFO] [timer.py:264:stop] epoch=0/micro_step=4200/global_step=2100, RunningAvgSamplesPerSec=46.51148902071231, CurrSamplesPerSec=46.4236281604466, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:54,277] [INFO] [logging.py:128:log_dist] [Rank 0] step=2110, skipped=19, lr=[1.5242111084402238e-06, 1.5242111084402238e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:54,297] [INFO] [timer.py:264:stop] epoch=0/micro_step=4220/global_step=2110, RunningAvgSamplesPerSec=46.511339445793595, CurrSamplesPerSec=46.41922892018923, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:01,207] [INFO] [logging.py:128:log_dist] [Rank 0] step=2120, skipped=19, lr=[1.4115879437524043e-06, 1.4115879437524043e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:01,228] [INFO] [timer.py:264:stop] epoch=0/micro_step=4240/global_step=2120, RunningAvgSamplesPerSec=46.51130295982599, CurrSamplesPerSec=46.511304865842796, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:08,136] [INFO] [logging.py:128:log_dist] [Rank 0] step=2130, skipped=19, lr=[1.3031660345068002e-06, 1.3031660345068002e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:08,156] [INFO] [timer.py:264:stop] epoch=0/micro_step=4260/global_step=2130, RunningAvgSamplesPerSec=46.51131404937642, CurrSamplesPerSec=46.51030558048654, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:15,059] [INFO] [logging.py:128:log_dist] [Rank 0] step=2140, skipped=19, lr=[1.1989646913774466e-06, 1.1989646913774466e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:15,080] [INFO] [timer.py:264:stop] epoch=0/micro_step=4280/global_step=2140, RunningAvgSamplesPerSec=46.5114519203876, CurrSamplesPerSec=46.79242903203494, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:22,020] [INFO] [logging.py:128:log_dist] [Rank 0] step=2150, skipped=19, lr=[1.0990024733270572e-06, 1.0990024733270572e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:22,040] [INFO] [timer.py:264:stop] epoch=0/micro_step=4300/global_step=2150, RunningAvgSamplesPerSec=46.51052237053515, CurrSamplesPerSec=46.532267517101225, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:28,947] [INFO] [logging.py:128:log_dist] [Rank 0] step=2160, skipped=19, lr=[1.0032971843015576e-06, 1.0032971843015576e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:28,967] [INFO] [timer.py:264:stop] epoch=0/micro_step=4320/global_step=2160, RunningAvgSamplesPerSec=46.5105816299913, CurrSamplesPerSec=46.438469703874325, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:35,881] [INFO] [logging.py:128:log_dist] [Rank 0] step=2170, skipped=19, lr=[9.118658700590616e-07, 9.118658700590616e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:35,902] [INFO] [timer.py:264:stop] epoch=0/micro_step=4340/global_step=2170, RunningAvgSamplesPerSec=46.510394643211036, CurrSamplesPerSec=46.47535762417345, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:42,807] [INFO] [logging.py:128:log_dist] [Rank 0] step=2180, skipped=19, lr=[8.247248151339343e-07, 8.247248151339343e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:42,828] [INFO] [timer.py:264:stop] epoch=0/micro_step=4360/global_step=2180, RunningAvgSamplesPerSec=46.510493742010965, CurrSamplesPerSec=46.50587378396359, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:46,253] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,259] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,259] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:49,737] [INFO] [logging.py:128:log_dist] [Rank 0] step=2190, skipped=19, lr=[7.418895399363746e-07, 7.418895399363746e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:49,758] [INFO] [timer.py:264:stop] epoch=0/micro_step=4380/global_step=2190, RunningAvgSamplesPerSec=46.510429443817735, CurrSamplesPerSec=46.49342092923379, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:56,670] [INFO] [logging.py:128:log_dist] [Rank 0] step=2200, skipped=19, lr=[6.633747979881533e-07, 6.633747979881533e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:56,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=4400/global_step=2200, RunningAvgSamplesPerSec=46.51038262533071, CurrSamplesPerSec=46.55770605348971, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:03,604] [INFO] [logging.py:128:log_dist] [Rank 0] step=2210, skipped=19, lr=[5.891945732949017e-07, 5.891945732949017e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:03,623] [INFO] [timer.py:264:stop] epoch=0/micro_step=4420/global_step=2210, RunningAvgSamplesPerSec=46.51031258811781, CurrSamplesPerSec=46.33170675886358, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:10,546] [INFO] [logging.py:128:log_dist] [Rank 0] step=2220, skipped=19, lr=[5.193620778554536e-07, 5.193620778554536e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:10,567] [INFO] [timer.py:264:stop] epoch=0/micro_step=4440/global_step=2220, RunningAvgSamplesPerSec=46.509899108741436, CurrSamplesPerSec=46.37364797007833, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:17,446] [INFO] [logging.py:128:log_dist] [Rank 0] step=2230, skipped=19, lr=[4.538897493087113e-07, 4.538897493087113e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:17,467] [INFO] [timer.py:264:stop] epoch=0/micro_step=4460/global_step=2230, RunningAvgSamplesPerSec=46.51082145039772, CurrSamplesPerSec=46.73201748416546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:24,357] [INFO] [logging.py:128:log_dist] [Rank 0] step=2240, skipped=19, lr=[3.927892487184254e-07, 3.927892487184254e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:24,377] [INFO] [timer.py:264:stop] epoch=0/micro_step=4480/global_step=2240, RunningAvgSamplesPerSec=46.51141811493153, CurrSamplesPerSec=46.78347475499735, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:31,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=2250, skipped=19, lr=[3.360714584962621e-07, 3.360714584962621e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:31,275] [INFO] [timer.py:264:stop] epoch=0/micro_step=4500/global_step=2250, RunningAvgSamplesPerSec=46.512457104244405, CurrSamplesPerSec=46.71571944298993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:38,143] [INFO] [logging.py:128:log_dist] [Rank 0] step=2260, skipped=19, lr=[2.83746480463587e-07, 2.83746480463587e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:38,164] [INFO] [timer.py:264:stop] epoch=0/micro_step=4520/global_step=2260, RunningAvgSamplesPerSec=46.51361995388357, CurrSamplesPerSec=46.76160094278518, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:45,038] [INFO] [logging.py:128:log_dist] [Rank 0] step=2270, skipped=19, lr=[2.3582363405225405e-07, 2.3582363405225405e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:45,059] [INFO] [timer.py:264:stop] epoch=0/micro_step=4540/global_step=2270, RunningAvgSamplesPerSec=46.514651946554, CurrSamplesPerSec=46.784420582288035, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:51,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=2280, skipped=19, lr=[1.9231145464475297e-07, 1.9231145464475297e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:51,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=4560/global_step=2280, RunningAvgSamplesPerSec=46.51576697162558, CurrSamplesPerSec=46.77840381607327, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:55,353] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,353] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,357] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,357] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:58,763] [INFO] [logging.py:128:log_dist] [Rank 0] step=2290, skipped=20, lr=[1.5692803494115337e-07, 1.5692803494115337e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:58,783] [INFO] [timer.py:264:stop] epoch=0/micro_step=4580/global_step=2290, RunningAvgSamplesPerSec=46.51851714180861, CurrSamplesPerSec=46.79658928835533, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:05,670] [INFO] [logging.py:128:log_dist] [Rank 0] step=2300, skipped=20, lr=[1.2181682318773424e-07, 1.2181682318773424e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:05,691] [INFO] [timer.py:264:stop] epoch=0/micro_step=4600/global_step=2300, RunningAvgSamplesPerSec=46.519098342938314, CurrSamplesPerSec=46.81861017418855, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:12,559] [INFO] [logging.py:128:log_dist] [Rank 0] step=2310, skipped=20, lr=[9.11365838208722e-08, 9.11365838208722e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:12,579] [INFO] [timer.py:264:stop] epoch=0/micro_step=4620/global_step=2310, RunningAvgSamplesPerSec=46.520263336208856, CurrSamplesPerSec=46.77643117546071, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:19,386] [INFO] [logging.py:128:log_dist] [Rank 0] step=2320, skipped=21, lr=[6.731738239555418e-08, 6.731738239555418e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:19,407] [INFO] [timer.py:264:stop] epoch=0/micro_step=4640/global_step=2320, RunningAvgSamplesPerSec=46.52321387815545, CurrSamplesPerSec=46.80081556093043, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:26,272] [INFO] [logging.py:128:log_dist] [Rank 0] step=2330, skipped=21, lr=[4.507039181994299e-08, 4.507039181994299e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:26,293] [INFO] [timer.py:264:stop] epoch=0/micro_step=4660/global_step=2330, RunningAvgSamplesPerSec=46.52443592782957, CurrSamplesPerSec=46.81611158184509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:33,160] [INFO] [logging.py:128:log_dist] [Rank 0] step=2340, skipped=21, lr=[2.726804268846084e-08, 2.726804268846084e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:33,181] [INFO] [timer.py:264:stop] epoch=0/micro_step=4680/global_step=2340, RunningAvgSamplesPerSec=46.52560854377931, CurrSamplesPerSec=46.75790299843455, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:40,046] [INFO] [logging.py:128:log_dist] [Rank 0] step=2350, skipped=21, lr=[1.3913505719678755e-08, 1.3913505719678755e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:40,067] [INFO] [timer.py:264:stop] epoch=0/micro_step=4700/global_step=2350, RunningAvgSamplesPerSec=46.526791452423346, CurrSamplesPerSec=46.81642185005165, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +***** Evaluating rewards, Epoch 1/1 ***** +chosen: -31.9375, rejected: -33.96875, loss: 0.6650807857513428 +saving the final model ... +[rank0]:[W102 16:15:18.696314488 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator()) +[2025-01-02 16:15:21,891] [INFO] [launch.py:351:main] Process 106033 exits successfully. +[2025-01-02 16:15:22,894] [INFO] [launch.py:351:main] Process 106032 exits successfully. +[2025-01-02 16:15:22,895] [INFO] [launch.py:351:main] Process 106038 exits successfully. +[2025-01-02 16:15:22,896] [INFO] [launch.py:351:main] Process 106031 exits successfully. +[2025-01-02 16:15:22,896] [INFO] [launch.py:351:main] Process 106037 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106034 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106036 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106035 exits successfully.