diff --git a/configs/1B/H100.toml b/configs/1B/H100.toml index 81cfe3f6..4d5a325e 100644 --- a/configs/1B/H100.toml +++ b/configs/1B/H100.toml @@ -3,10 +3,11 @@ project = "debug_1B_zero_band" type_model = "llama2" [train] -micro_bs = 16 +micro_bs = 32 +reshard_after_forward = true [optim] -batch_size = 2048 +batch_size = 1024 warmup_steps = 1000 -total_steps = 88_000 -lr = 4e-4 \ No newline at end of file +total_steps = 8192 +lr = 7e-4 \ No newline at end of file diff --git a/configs/1B/H100_c4.toml b/configs/1B/H100_c4.toml deleted file mode 100644 index 1695017d..00000000 --- a/configs/1B/H100_c4.toml +++ /dev/null @@ -1,15 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 16 - -[optim] -batch_size = 128 -warmup_steps = 1000 -total_steps = 88_000 -lr = 3e-4 - -[data] -seq_length = 2048 \ No newline at end of file diff --git a/configs/1B/H100_llama2_edu.toml b/configs/1B/H100_llama2_edu.toml deleted file mode 100644 index 31eb3a32..00000000 --- a/configs/1B/H100_llama2_edu.toml +++ /dev/null @@ -1,21 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 4 -reshard_after_forward = true - -[data] -seq_length = 8192 -num_workers = 4 -dataset_name_or_paths = "/data/datasets/fineweb-edu" -reverse_data_files = true - -[optim] -batch_size = 256 -warmup_steps = 1000 -total_steps = 1_000_000_000_000 -sched_type = "wsd-sqrt" -lr = 4e-4 -z_loss = true diff --git a/configs/1B/H100_llama2_edu_no_feat.toml b/configs/1B/H100_llama2_edu_no_feat.toml deleted file mode 100644 index 0afd432f..00000000 --- a/configs/1B/H100_llama2_edu_no_feat.toml +++ /dev/null @@ -1,23 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 4 -reshard_after_forward = true -attn_fn = "sdpa" -sequence_packing = false - -[data] -seq_length = 8192 -num_workers = 4 -dataset_name_or_paths = "/data/datasets/fineweb-edu" -reverse_data_files = true - -[optim] -batch_size = 256 -warmup_steps = 1000 -total_steps = 1_000_000_000_000 -sched_type = "wsd-sqrt" -lr = 2e-4 -z_loss = false diff --git a/configs/1B/H100_llama3.toml b/configs/1B/H100_llama3.toml deleted file mode 100644 index d4b3ee23..00000000 --- a/configs/1B/H100_llama3.toml +++ /dev/null @@ -1,22 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama3" - -[train] -micro_bs = 1 -reshard_after_forward = true - -[data] -seq_length = 8192 -num_workers = 4 -dataset_name_or_paths = "/data/datasets/fineweb-edu" -reverse_data_files = true - -[optim] -batch_size = 256 -warmup_steps = 1000 -total_steps = 1_000_000_000_000 -sched_type = "wsd-sqrt" -lr = 4e-4 -z_loss = true -