Merge pull request rui-ye#19 from rui-ye/training

[rui/training] implement QLoRA for better 4bit quantization
saadiabadi · Mar 30, 2024 · 859865a · 859865a
2 parents d45644d + d9f8087
commit 859865a
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,7 @@
+training_scripts/*
+!training_scripts/run_sft.sh
+!training_scripts/run_dpo.sh
+
 evaluation/open_ended/data/*/model_answer/*
 evaluation/open_ended/data/*/model_judgment/*
 

diff --git a/config.py b/config.py
@@ -22,6 +22,7 @@ class FedArguments:
     fedopt_eta: Optional[float] = field(default=1e-3, metadata={"help": "the global learning rate parameter of FedAdagrad, FedYogi and FedAdam"})
     fedopt_beta1: Optional[float] = field(default=0.9, metadata={"help": "the beta1 parameter of FedYogi and FedAdam"})
     fedopt_beta2: Optional[float] = field(default=0.99, metadata={"help": "the beta2 parameter of FedYogi and FedAdam"})
+    save_model_freq: Optional[int] = field(default=50, metadata={"help": "the frequency to save the model. 50 means save every 50 rounds"})
 
 @dataclass
 class ScriptArguments:
@@ -102,9 +103,19 @@ def get_training_args(script_args, new_lr):
 def get_model_config(script_args):
     if script_args.load_in_8bit and script_args.load_in_4bit:
         raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
-    elif script_args.load_in_8bit or script_args.load_in_4bit:
+    elif script_args.load_in_8bit:
         quantization_config = BitsAndBytesConfig(
-            load_in_8bit=script_args.load_in_8bit, load_in_4bit=script_args.load_in_4bit
+            load_in_8bit=script_args.load_in_8bit
+        )
+        # Copy the model to each device
+        device_map = {"": Accelerator().local_process_index}
+        torch_dtype = torch.bfloat16
+    elif script_args.load_in_4bit:
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=script_args.load_in_4bit,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
         )
         # Copy the model to each device
         device_map = {"": Accelerator().local_process_index}

diff --git a/main_dpo.py b/main_dpo.py
@@ -61,6 +61,11 @@
 model = get_peft_model(model, peft_config)
 model.print_trainable_parameters()
 
+model.config.use_cache = False  # Disable the KV cache during training to silence the warnings; re-enable it for inference!
+
+if training_args.gradient_checkpointing:
+    model.enable_input_require_grads()
+
 # ===== Define the global and local models =====
 global_dict = copy.deepcopy(get_peft_model_state_dict(model))
 local_dict_list = [copy.deepcopy(global_dict) for i in range(fed_args.num_clients)]
@@ -115,7 +120,7 @@
     set_peft_model_state_dict(model, global_dict)   # update global model
 
     # ===== Save the model =====
-    if (round+1) % 50 == 0:
+    if (round+1) % fed_args.save_model_freq == 0:
         trainer.save_model(os.path.join(script_args.output_dir, f"checkpoint-{round+1}"))
 
     np.save(os.path.join(script_args.output_dir, "training_loss.npy"), np.array(training_loss))
diff --git a/main_sft.py b/main_sft.py
@@ -44,6 +44,11 @@
 model = get_peft_model(model, peft_config)
 model.print_trainable_parameters()
 
+model.config.use_cache = False  # Disable the KV cache during training to silence the warnings; re-enable it for inference!
+
+if training_args.gradient_checkpointing:
+    model.enable_input_require_grads()
+
 # ===== Define the global and local models =====
 global_dict = copy.deepcopy(get_peft_model_state_dict(model))
 local_dict_list = [copy.deepcopy(global_dict) for i in range(fed_args.num_clients)]
@@ -114,7 +119,7 @@
     set_peft_model_state_dict(model, global_dict)   # Update global model
 
     # ===== Save the model =====
-    if (round+1) % 50 == 0:
+    if (round+1) % fed_args.save_model_freq == 0:
         trainer.save_model(os.path.join(script_args.output_dir, f"checkpoint-{round+1}"))
 
     np.save(os.path.join(script_args.output_dir, "training_loss.npy"), np.array(training_loss))