
Commit

updating to fused (#2293)
SalmanMohammadi authored Jan 30, 2025
1 parent 8779997 commit ac471a6
Showing 37 changed files with 51 additions and 51 deletions.
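
The change is a single mechanical substitution: the optimizer setting adamw_torch becomes adamw_torch_fused across example configs and test fixtures. As a rough sketch of what that name selects (an assumption based on the usual Hugging Face Transformers / PyTorch mapping, not code taken from this commit), the fused variant corresponds to constructing PyTorch's AdamW with its fused CUDA kernels enabled:

import torch

# Minimal sketch (assumption): "adamw_torch_fused" selects torch.optim.AdamW
# with fused=True, so the parameter update runs as fused CUDA kernels instead
# of a per-tensor loop. Hyperparameters mirror values in the configs below.
model = torch.nn.Linear(4096, 4096).cuda()  # fused AdamW requires CUDA tensors

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    betas=(0.9, 0.95),  # adam_beta2: 0.95
    eps=1e-9,           # adam_eps: 0.000000001
    fused=True,         # the "fused" variant this commit switches to
)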
examples/cerebras/btlm-ft.yml (2 changes: 1 addition & 1 deletion)
@@ -46,7 +46,7 @@ output_dir: ./outputs/btlm-out
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 adam_beta2: 0.95
 adam_eps: 0.000000001
 max_grad_norm: 1.0

examples/deepseek-v2/fft-fsdp-16b.yaml (2 changes: 1 addition & 1 deletion)
@@ -27,7 +27,7 @@ wandb_log_model:
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 num_epochs: 1
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

examples/deepseek-v2/qlora-fsdp-2_5.yaml (2 changes: 1 addition & 1 deletion)
@@ -47,7 +47,7 @@ peft_use_rslora: true
 gradient_accumulation_steps: 1
 micro_batch_size: 8
 num_epochs: 1
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

examples/jamba/qlora_fsdp_large.yaml (2 changes: 1 addition & 1 deletion)
@@ -34,7 +34,7 @@ lora_target_linear: false
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 2
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

examples/llama-2/gptq-lora.yml (2 changes: 1 addition & 1 deletion)
@@ -42,7 +42,7 @@ output_dir: ./outputs/model-out
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 adam_beta2: 0.95
 adam_eps: 0.00001
 max_grad_norm: 1.0

examples/llama-2/qlora-fsdp.yml (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 4
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

examples/llama-3/fft-8b-liger-fsdp.yaml (2 changes: 1 addition & 1 deletion)
@@ -37,7 +37,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

examples/llama-3/qlora-fsdp-405b.yaml (2 changes: 1 addition & 1 deletion)
@@ -30,7 +30,7 @@ lora_target_linear: true
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 2
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

examples/llama-3/qlora-fsdp-70b.yaml (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

examples/mistral/lora-mps.yml (2 changes: 1 addition & 1 deletion)
@@ -47,7 +47,7 @@ wandb_log_model:
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 num_epochs: 2
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002

examples/mistral/mixtral-8x22b-qlora-fsdp.yml (2 changes: 1 addition & 1 deletion)
@@ -41,7 +41,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002

examples/mistral/mixtral-qlora-fsdp.yml (2 changes: 1 addition & 1 deletion)
@@ -43,7 +43,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002

examples/phi/phi-ft.yml (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ wandb_log_model:
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 adam_beta2: 0.95
 adam_epsilon: 0.00001
 max_grad_norm: 1.0

examples/phi/phi-qlora.yml (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ wandb_log_model:
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 adam_beta2: 0.95
 adam_epsilon: 0.00001
 max_grad_norm: 1.0

examples/phi/phi2-ft.yml (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ wandb_log_model:
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 adam_beta2: 0.95
 adam_epsilon: 0.00001
 max_grad_norm: 1.0

examples/phi/phi3-ft-fsdp.yml (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 12
 num_epochs: 2
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 adam_beta2: 0.95
 adam_epsilon: 0.00001
 max_grad_norm: 1.0

examples/phi/phi3-ft.yml (2 changes: 1 addition & 1 deletion)
@@ -35,7 +35,7 @@ lora_fan_in_fan_out:
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 1
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 adam_beta2: 0.95
 adam_epsilon: 0.00001
 max_grad_norm: 1.0

examples/qwen2/qlora-fsdp.yaml (2 changes: 1 addition & 1 deletion)
@@ -37,7 +37,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002

examples/tiny-llama/lora-mps.yml (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002

tests/core/test_trainer_builder.py (2 changes: 1 addition & 1 deletion)
@@ -22,7 +22,7 @@ def fixture_cfg():
 "output_dir": "./model-out",
 "warmup_steps": 10,
 "gradient_checkpointing": False,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "sequence_len": 2048,
 "rl": True,
 "adam_beta1": 0.998,

tests/e2e/integrations/test_cut_cross_entropy.py (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ def min_cfg(temp_dir):
 "micro_batch_size": 8,
 "gradient_accumulation_steps": 1,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "output_dir": temp_dir,
 "lr_scheduler": "cosine",
 "save_safetensors": True,

tests/e2e/integrations/test_liger.py (4 changes: 2 additions & 2 deletions)
@@ -48,7 +48,7 @@ def test_llama_wo_flce(self, temp_dir):
 "gradient_accumulation_steps": 2,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "save_safetensors": True,
 "bf16": "auto",
@@ -93,7 +93,7 @@ def test_llama_w_flce(self, temp_dir):
 "gradient_accumulation_steps": 2,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "save_safetensors": True,
 "bf16": "auto",

tests/e2e/multigpu/test_llama.py (12 changes: 6 additions & 6 deletions)
@@ -331,7 +331,7 @@ def test_fsdp(self, temp_dir, gradient_accumulation_steps):
 "gradient_accumulation_steps": gradient_accumulation_steps,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "flash_attention": True,
 "fsdp": [
@@ -401,7 +401,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
 "gradient_accumulation_steps": 4,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "flash_attention": True,
 "fsdp": [
@@ -480,7 +480,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
 "gradient_accumulation_steps": 4,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "flash_attention": True,
 "fsdp": [
@@ -575,7 +575,7 @@ def test_ds_zero3_packed(
 "gradient_accumulation_steps": gradient_accumulation_steps,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "flash_attention": True,
 "deepspeed": str(AXOLOTL_ROOT / deepspeed),
@@ -648,7 +648,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
 "gradient_accumulation_steps": gradient_accumulation_steps,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "flash_attention": True,
 "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
@@ -721,7 +721,7 @@ def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
 "gradient_accumulation_steps": gradient_accumulation_steps,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "flash_attention": True,
 "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),

tests/e2e/multigpu/test_qwen2.py (2 changes: 1 addition & 1 deletion)
@@ -52,7 +52,7 @@ def test_qlora_fsdp_dpo(self, base_model, temp_dir):
 "gradient_accumulation_steps": 2,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "flash_attention": True,
 "bf16": "auto",

tests/e2e/patched/test_4d_multipack_llama.py (4 changes: 2 additions & 2 deletions)
@@ -52,7 +52,7 @@ def test_sdp_lora_packing(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 20,
 "save_steps": 10,
@@ -96,7 +96,7 @@ def test_torch_lora_packing(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 20,
 "save_steps": 10,

tests/e2e/patched/test_fused_llama.py (2 changes: 1 addition & 1 deletion)
@@ -56,7 +56,7 @@ def test_fft_packing(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 10,
 "save_steps": 5,

tests/e2e/patched/test_llama_s2_attention.py (4 changes: 2 additions & 2 deletions)
@@ -56,7 +56,7 @@ def test_lora_s2_attn(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 10,
 "save_steps": 5,
@@ -96,7 +96,7 @@ def test_fft_s2_attn(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 10,
 "save_steps": 5,

tests/e2e/patched/test_lora_llama_multipack.py (4 changes: 2 additions & 2 deletions)
@@ -61,7 +61,7 @@ def test_lora_packing(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 }
 )
@@ -116,7 +116,7 @@ def test_lora_gptq_packed(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 }
 )

tests/e2e/patched/test_mistral_samplepack.py (4 changes: 2 additions & 2 deletions)
@@ -55,7 +55,7 @@ def test_lora_packing(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 20,
 "save_steps": 10,
@@ -96,7 +96,7 @@ def test_ft_packing(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 20,
 "save_steps": 10,

tests/e2e/test_embeddings_lr.py (4 changes: 2 additions & 2 deletions)
@@ -48,7 +48,7 @@ def test_train_w_embedding_lr_scale(self, temp_dir):
 "val_set_size": 0.0,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "embedding_lr_scale": 0.5,
 "lr_scheduler": "cosine",
 "save_safetensors": True,
@@ -92,7 +92,7 @@ def test_train_w_embedding_lr(self, temp_dir):
 "val_set_size": 0.0,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "embedding_lr": 0.000005,
 "lr_scheduler": "cosine",
 "save_safetensors": True,

tests/e2e/test_falcon.py (6 changes: 3 additions & 3 deletions)
@@ -57,7 +57,7 @@ def test_lora(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 20,
 "save_steps": 10,
@@ -110,7 +110,7 @@ def test_lora_added_vocab(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 20,
 "save_steps": 10,
@@ -149,7 +149,7 @@ def test_ft(self, temp_dir):
 "gradient_accumulation_steps": 1,
 "output_dir": temp_dir,
 "learning_rate": 0.00001,
-"optimizer": "adamw_torch",
+"optimizer": "adamw_torch_fused",
 "lr_scheduler": "cosine",
 "max_steps": 20,
 "save_steps": 10,


