From deb1756b7bdd83ed1e148cc55ea154b058e35740 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Fri, 5 Jul 2024 09:13:43 +0000 Subject: [PATCH 01/12] fix default tokenize function for enabling padding --- llm_on_ray/finetune/data_process.py | 48 ++++++++++------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/llm_on_ray/finetune/data_process.py b/llm_on_ray/finetune/data_process.py index 6435928a..411641f5 100644 --- a/llm_on_ray/finetune/data_process.py +++ b/llm_on_ray/finetune/data_process.py @@ -183,38 +183,22 @@ def tokenize(self, examples): if len(keys) != 2: raise ValueError("Unsupported dataset format") - examples["input_ids"] = [] - examples["labels"] = [] - examples["attention_mask"] = [] + zip_examples = [] for s, t in zip(examples[keys[0]], examples[keys[1]]): - results = self.tokenizer( - s + t, - padding=self.padding, - truncation=self.truncation, - return_tensors=None, - max_length=self.max_length, - ) - - input_ids = results["input_ids"] - input_len = len(input_ids) - labels = copy.deepcopy(input_ids) - if self.mask_input or self.mask_response: - sources_tokenized = self.tokenizer( - s, - padding=False, - truncation=True, - return_tensors=None, - max_length=self.max_length, - ) - input_id_len = len(sources_tokenized["input_ids"]) + zip_examples.append(s + t) + + tokenized_examples = self.tokenizer(zip_examples, padding=self.padding, truncation=self.truncation, + return_tensors=None, max_length=self.max_length) + tokenized_examples["labels"] = copy.deepcopy(tokenized_examples["input_ids"]) + + if self.mask_input or self.mask_response: + tokenized_sources = self.tokenizer(examples[keys[0]], padding=False, truncation=self.truncation, + return_tensors=None, max_length=self.max_length) + for idx in range(len(tokenized_examples["input_ids"])): + len1 = len(tokenized_examples["input_ids"][idx]) + len2 = len(tokenized_sources["input_ids"][idx]) # mask input - if self.mask_input: - labels[:input_id_len] = [IGNORE_INDEX] * input_id_len + tokenized_examples["labels"][idx][:len2] = [IGNORE_INDEX] * len2 # mask response - if self.mask_response: - labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) - - examples["input_ids"].append(results["input_ids"]) - examples["labels"].append(labels) - examples["attention_mask"].append(results["attention_mask"]) - return examples + tokenized_examples["labels"][idx][len2:len1] = [IGNORE_INDEX] * (len1 - len2) + return tokenized_examples From 40cc4d29c126e081c51e9f2a1076ae7cb7e23344 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Fri, 5 Jul 2024 09:26:57 +0000 Subject: [PATCH 02/12] update --- llm_on_ray/finetune/finetune.py | 20 ++++++++++++++++---- llm_on_ray/finetune/finetune_config.py | 3 +++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 8c67dcb4..3ffd8784 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -116,13 +116,13 @@ def convert_to_training_args(cls, config: Dict): # set attr for FSDP # if accelerate_mode == "FSDP": - # args.updatwe({}) + # args.update({}) # set attr for Intel Gaudi if device == "hpu": args.update({"use_habana": True}) args.update({"use_lazy_mode": config["Training"]["hpu_execution_mode"] == "lazy"}) - args.update({"pipelining_fwd_bwd": True}) + args.update({"pipelining_fwd_bwd": config["Training"]["pipelining_fwd_bwd"]}) return cls(**args) @@ -274,12 +274,24 @@ def load_model(config: Dict): peft_config = LoraConfig(**lora_config) model = 
get_peft_model(model, peft_config) - egc = config["General"].get("enable_gradient_checkpointing", False) - if egc: + if config["General"].get("enable_gradient_checkpointing", False): model.enable_input_require_grads() model.gradient_checkpointing_enable() model.config.use_cache = False + if model.config.model_type == "llama": + model.generation_config.pad_token_id = 0 + model.generation_config.bos_token_id = 1 + model.generation_config.eos_token_id = 2 + attn_softmax_bf16 = config["General"]["attn_softmax_bf16"] + if attn_softmax_bf16: + model.generation_config.attn_softmax_bf16 + use_flash_attention = config["General"]["use_flash_attention"] + if use_flash_attention: + model.generation_config.use_flash_attention = True + model.generation_config.flash_attention_recompute = False + model.generation_config.flash_attention_causal_mask = False + model.to(dtype=model_dtype, device=torch.device(config["Training"]["device"])) return model diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 27bbe3cd..2dd54949 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -53,6 +53,8 @@ class General(BaseModel): report_to: str = "none" resume_from_checkpoint: Optional[str] = None save_strategy: str = "no" + attn_softmax_bf16: bool = False + use_flash_attention: bool = False config: GeneralConfig lora_config: Optional[LoraConfig] = None enable_gradient_checkpointing: bool = False @@ -104,6 +106,7 @@ class Training(BaseModel): mixed_precision: str = PRECISION_NO gradient_accumulation_steps: int = 1 logging_steps: int = 10 + pipelining_fwd_bwd: bool = False deepspeed_config_file: str = "" @validator("device") From a151d7b3ddb0babc84909e49f480c619d762b472 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Fri, 5 Jul 2024 09:27:34 +0000 Subject: [PATCH 03/12] update --- llm_on_ray/finetune/data_process.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/llm_on_ray/finetune/data_process.py b/llm_on_ray/finetune/data_process.py index 411641f5..aea2ba96 100644 --- a/llm_on_ray/finetune/data_process.py +++ b/llm_on_ray/finetune/data_process.py @@ -187,13 +187,23 @@ def tokenize(self, examples): for s, t in zip(examples[keys[0]], examples[keys[1]]): zip_examples.append(s + t) - tokenized_examples = self.tokenizer(zip_examples, padding=self.padding, truncation=self.truncation, - return_tensors=None, max_length=self.max_length) + tokenized_examples = self.tokenizer( + zip_examples, + padding=self.padding, + truncation=self.truncation, + return_tensors=None, + max_length=self.max_length, + ) tokenized_examples["labels"] = copy.deepcopy(tokenized_examples["input_ids"]) if self.mask_input or self.mask_response: - tokenized_sources = self.tokenizer(examples[keys[0]], padding=False, truncation=self.truncation, - return_tensors=None, max_length=self.max_length) + tokenized_sources = self.tokenizer( + examples[keys[0]], + padding=False, + truncation=self.truncation, + return_tensors=None, + max_length=self.max_length, + ) for idx in range(len(tokenized_examples["input_ids"])): len1 = len(tokenized_examples["input_ids"][idx]) len2 = len(tokenized_sources["input_ids"][idx]) From f886ae7441d7c94f7830ea51110ba8ccb1fc5547 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 8 Jul 2024 15:55:24 +0000 Subject: [PATCH 04/12] update --- llm_on_ray/finetune/finetune.py | 39 +++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py 
b/llm_on_ray/finetune/finetune.py index 3ffd8784..5af3c86c 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -68,8 +68,17 @@ def set_seed(config: Dict): _set_seed(seed) -def convert_to_training_args(cls, config: Dict): +def prepare_training_args(config: Dict): device = config["Training"]["device"] + if device == "hpu": + from optimum.habana.transformers import GaudiTrainingArguments + + cls = GaudiTrainingArguments + else: + from transformers import TrainingArguments + + cls = TrainingArguments + accelerate_mode = config["Training"]["accelerate_mode"] save_strategy = config["General"]["save_strategy"] @@ -279,30 +288,34 @@ def load_model(config: Dict): model.gradient_checkpointing_enable() model.config.use_cache = False + device = config["Training"]["device"] if model.config.model_type == "llama": model.generation_config.pad_token_id = 0 model.generation_config.bos_token_id = 1 model.generation_config.eos_token_id = 2 attn_softmax_bf16 = config["General"]["attn_softmax_bf16"] - if attn_softmax_bf16: + if attn_softmax_bf16 and device == "hpu": model.generation_config.attn_softmax_bf16 use_flash_attention = config["General"]["use_flash_attention"] - if use_flash_attention: + if use_flash_attention and device == "hpu": model.generation_config.use_flash_attention = True model.generation_config.flash_attention_recompute = False model.generation_config.flash_attention_causal_mask = False - model.to(dtype=model_dtype, device=torch.device(config["Training"]["device"])) + # model.to(dtype=model_dtype, device=torch.device(device)) return model -def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator): +def get_trainer(config: Dict, training_args, model, tokenizer, tokenized_dataset): + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8 + ) + device = config["Training"]["device"] if device in ["cpu", "gpu"]: - from transformers import Trainer, TrainingArguments + from transformers import Trainer - training_args = convert_to_training_args(TrainingArguments, config) trainer = Trainer( model=model, args=training_args, @@ -313,10 +326,9 @@ def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator tokenizer=tokenizer, data_collator=data_collator, ) - return training_args, trainer + return trainer elif device in ["hpu"]: from optimum.habana.transformers import GaudiTrainer - from optimum.habana.transformers import GaudiTrainingArguments from optimum.habana import GaudiConfig # If gaudi_config_name is provided, load gaudi_config from huggingface model hub(https://huggingface.co/Habana), otherwise use default gaudi_config @@ -328,7 +340,6 @@ def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator gaudi_config.use_fused_adam = True gaudi_config.use_fused_clip_norm = True - training_args = convert_to_training_args(GaudiTrainingArguments, config) trainer = GaudiTrainer( model=model, args=training_args, @@ -340,7 +351,7 @@ def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator tokenizer=tokenizer, data_collator=data_collator, ) - return training_args, trainer + return trainer return None @@ -351,17 +362,17 @@ def train_func(config: Dict[str, Any]): set_seed(config) + training_args = prepare_training_args(config) + tokenizer = load_tokenizer(config) dataset = load_dataset(config) tokenized_dataset = tokenize_dataset(config, tokenizer, dataset) - data_collator = prepare_data_collator(config, 
tokenizer) - model = load_model(config) - training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator) + trainer = get_trainer(config, training_args, model, tokenizer, tokenized_dataset) common.logger.info("train start") trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) From 0adcb182cd5732b123c563f6346152188a64545f Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 8 Jul 2024 16:03:38 +0000 Subject: [PATCH 05/12] update --- llm_on_ray/finetune/finetune_config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 2dd54949..96fe0323 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -81,7 +81,12 @@ class Dataset(BaseModel): padding: bool = True mask_input: bool = True mask_response: bool = True - data_preprocess_type: str = "neural_chat" + data_preprocess_type: str = "default" + + @validator("data_preprocess_type") + def check_data_preprocess_type(cls, v: str): + assert v in ["default", "neural_chat"] + return v class RayResourceConfig(BaseModel): From a426945ee58a2e0d9e6af3f1cede43e1f510d06e Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 8 Jul 2024 16:30:59 +0000 Subject: [PATCH 06/12] update --- llm_on_ray/finetune/finetune.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 5af3c86c..84b7700b 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -374,10 +374,23 @@ def train_func(config: Dict[str, Any]): trainer = get_trainer(config, training_args, model, tokenizer, tokenized_dataset) - common.logger.info("train start") - trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - trainer.save_model() - common.logger.info("train finish") + if training_args.do_train: + common.logger.info("train start") + results = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + + metrics = results.metrics + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + common.logger.info("train finish") + + if training_args.do_eval: + common.logger.info("eval start") + metrics = trainer.evaluate() + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + common.logger.info("eval finish") def get_finetune_config(): From d4d2402418ad9b1cf5c4ad6d597f20e9b5b8b513 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 8 Jul 2024 18:10:37 +0000 Subject: [PATCH 07/12] update --- llm_on_ray/finetune/finetune.py | 24 ++++++++++++++++++++++++ llm_on_ray/finetune/finetune_config.py | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 84b7700b..8f7fee20 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -28,6 +28,7 @@ import torch import datasets +import evaluate import transformers from peft import get_peft_model, LoraConfig @@ -340,6 +341,25 @@ def get_trainer(config: Dict, training_args, model, tokenizer, tokenized_dataset gaudi_config.use_fused_adam = True gaudi_config.use_fused_clip_norm = True + def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + 
logits = logits[0] + result = logits.argmax(dim=-1) + return result + + metric = evaluate.load("accuracy") + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # preds have the same shape as the labels, after the argmax(-1) has been calculated + # by preprocess_logits_for_metrics but we need to shift the labels + labels = labels[:, 1:].reshape(-1) + preds = preds[:, :-1].reshape(-1) + result = metric.compute(predictions=preds, references=labels) + return result + trainer = GaudiTrainer( model=model, args=training_args, @@ -350,6 +370,10 @@ def get_trainer(config: Dict, training_args, model, tokenizer, tokenized_dataset else None, tokenizer=tokenizer, data_collator=data_collator, + compute_metrics=compute_metrics if training_args.do_eval else None, + preprocess_logits_for_metrics=preprocess_logits_for_metrics + if training_args.do_eval + else None, ) return trainer return None diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 96fe0323..c19bda34 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -78,7 +78,7 @@ class Dataset(BaseModel): truncation_side: str = "right" max_seq_length: int = 512 truncation: bool = True - padding: bool = True + padding: bool = False mask_input: bool = True mask_response: bool = True data_preprocess_type: str = "default" From 25fff7746d464d4a7f43dd5bfb1a4d29c6d3beb5 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 9 Jul 2024 21:29:46 +0000 Subject: [PATCH 08/12] update --- llm_on_ray/finetune/finetune.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 8f7fee20..0a329773 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -54,10 +54,7 @@ def adapt_transformers_to_device(config: Dict): adapt_transformers_to_gaudi() -def set_seed(config: Dict): - seed = config["Training"].get("seed", None) - if seed is None: - return +def set_seed(config: Dict, seed): device = config["Training"]["device"] if device in ["cpu", "gpu"]: from accelerate.utils import set_seed as _set_seed @@ -265,12 +262,6 @@ def group_texts(examples): return tokenized_dataset -def prepare_data_collator(config: Dict, tokenizer): - return transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8 - ) - - def load_model(config: Dict): model_name = config["General"]["base_model"] model_dtype = convert_dtype(config["Training"].get("mixed_precision", "no")) @@ -332,7 +323,8 @@ def get_trainer(config: Dict, training_args, model, tokenizer, tokenized_dataset from optimum.habana.transformers import GaudiTrainer from optimum.habana import GaudiConfig - # If gaudi_config_name is provided, load gaudi_config from huggingface model hub(https://huggingface.co/Habana), otherwise use default gaudi_config + # If gaudi_config_name is provided, load gaudi_config from huggingface model hub(https://huggingface.co/Habana), + # otherwise use default gaudi_config gaudi_config_name = config["General"].get("gaudi_config_name", None) if gaudi_config_name is not None: gaudi_config = GaudiConfig.from_pretrained(gaudi_config_name) @@ -346,8 +338,7 @@ def preprocess_logits_for_metrics(logits, labels): # Depending on the model and config, logits may contain extra tensors, # like past_key_values, but logits always come first logits = logits[0] - result = logits.argmax(dim=-1) - return result + return 
logits.argmax(dim=-1) metric = evaluate.load("accuracy") @@ -357,8 +348,7 @@ def compute_metrics(eval_preds): # by preprocess_logits_for_metrics but we need to shift the labels labels = labels[:, 1:].reshape(-1) preds = preds[:, :-1].reshape(-1) - result = metric.compute(predictions=preds, references=labels) - return result + return metric.compute(predictions=preds, references=labels) trainer = GaudiTrainer( model=model, @@ -384,10 +374,10 @@ def train_func(config: Dict[str, Any]): adapt_transformers_to_device(config) - set_seed(config) - training_args = prepare_training_args(config) + set_seed(config, training_args.seed) + tokenizer = load_tokenizer(config) dataset = load_dataset(config) From ff803200d132fbfbad86964715cf85037afe55fa Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 9 Jul 2024 21:33:06 +0000 Subject: [PATCH 09/12] update --- dev/scripts/merge_lora_adaptor.py | 53 +++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 dev/scripts/merge_lora_adaptor.py diff --git a/dev/scripts/merge_lora_adaptor.py b/dev/scripts/merge_lora_adaptor.py new file mode 100644 index 00000000..e4cc41c4 --- /dev/null +++ b/dev/scripts/merge_lora_adaptor.py @@ -0,0 +1,53 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import argparse +import torch +from peft import PeftModel +from transformers import AutoTokenizer, AutoModelForCausalLM +from tqdm import tqdm + + +def apply_lora(base_model_path, lora_path): + print(f"Loading the base model from {base_model_path}") + base_tokenizer = AutoTokenizer.from_pretrained(base_model_path) + base = AutoModelForCausalLM.from_pretrained( + base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True + ) + + print(f"Loading the LoRA adapter from {lora_path}") + + lora_model = PeftModel.from_pretrained( + base, + lora_path, + ) + + print("Applying the LoRA") + model = lora_model.merge_and_unload() + + return base, model, base_tokenizer + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--base-model-path", type=str, required=True) + parser.add_argument("--lora-model-path", type=str, required=True) + parser.add_argument("--output-path", type=str, required=True) + + args = parser.parse_args() + + base, target, base_tokenizer = apply_lora(args.base_model_path, args.lora_model_path) + target.save_pretrained(args.output_path) + base_tokenizer.save_pretrained(args.output_path) From faa9e3584bb10089b678bb8b31e9cc9b0ff9b2f2 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Wed, 10 Jul 2024 13:52:47 +0000 Subject: [PATCH 10/12] update --- llm_on_ray/finetune/finetune.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 0a329773..0e6ff330 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -128,8 +128,15 @@ def prepare_training_args(config: Dict): # set attr for Intel Gaudi if device == "hpu": args.update({"use_habana": True}) - args.update({"use_lazy_mode": config["Training"]["hpu_execution_mode"] == "lazy"}) args.update({"pipelining_fwd_bwd": config["Training"]["pipelining_fwd_bwd"]}) + hpu_execution_mode = config["Training"]["hpu_execution_mode"] + if hpu_execution_mode == "lazy": + args.update({"use_lazy_mode": True}) + else: + args.update({"use_lazy_mode": False}) + if hpu_execution_mode == "eager.compile": + args.update({"torch_compile": True}) + args.update({"torch_compile_backend": "hpu_backend"}) return cls(**args) @@ -448,18 +455,14 @@ def main(external_config=None): "accelerate_mode" ] = "DDP" # will use DDP to accelerate if no method specified - ccl_worker_count = 1 device = config["Training"]["device"] if device != "cpu": - ccl_worker_count = num_training_workers + pass if not ray.is_initialized(): runtime_env = { "env_vars": { "OMP_NUM_THREADS": str(resources_per_worker["CPU"]), - "CCL_ZE_IPC_EXCHANGE": "sockets", - "CCL_WORKER_COUNT": str(ccl_worker_count), - "CCL_LOG_LEVEL": "info", "FI_TCP_IFACE": "lo", "FI_PROVIDER": "tcp", } From 85e9b8186479d46b4c68dd48e2c380aee8bb866f Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Thu, 11 Jul 2024 15:59:47 +0000 Subject: [PATCH 11/12] update --- llm_on_ray/common/logging.py | 2 +- llm_on_ray/finetune/finetune.py | 20 +++++++++++++++++--- llm_on_ray/finetune/finetune_config.py | 4 ++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/llm_on_ray/common/logging.py b/llm_on_ray/common/logging.py index c4b33c3d..1acad64d 100644 --- a/llm_on_ray/common/logging.py +++ b/llm_on_ray/common/logging.py @@ -27,7 +27,7 @@ logging_config = { "version": 1, "loggers": { - "root": {"level": "INFO", "handlers": ["consoleHandler"]}, + # "root": {"level": "INFO", "handlers": ["consoleHandler"]}, "common": { "level": "INFO", "handlers": 
["consoleHandler"], diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 0e6ff330..4d742700 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -99,6 +99,18 @@ def prepare_training_args(config: Dict): "do_train": True, } + adam_epsilon = config["Training"]["adam_epsilon"] + if adam_epsilon is not None: + args.update({"adam_epsilon": adam_epsilon}) + + warmup_ratio = config["Training"]["warmup_ratio"] + if warmup_ratio is not None: + args.update({"warmup_ratio": warmup_ratio}) + + max_grad_norm = config["Training"]["max_grad_norm"] + if max_grad_norm is not None: + args.update({"max_grad_norm": max_grad_norm}) + # set attr do_eval vf = config["Dataset"].get("validation_file", None) vsp = config["Dataset"].get("validation_split_percentage", 0) @@ -129,6 +141,7 @@ def prepare_training_args(config: Dict): if device == "hpu": args.update({"use_habana": True}) args.update({"pipelining_fwd_bwd": config["Training"]["pipelining_fwd_bwd"]}) + hpu_execution_mode = config["Training"]["hpu_execution_mode"] if hpu_execution_mode == "lazy": args.update({"use_lazy_mode": True}) @@ -138,6 +151,10 @@ def prepare_training_args(config: Dict): args.update({"torch_compile": True}) args.update({"torch_compile_backend": "hpu_backend"}) + throughput_warmup_steps = config["Training"]["throughput_warmup_steps"] + if throughput_warmup_steps is not None: + args.update({"throughput_warmup_steps": throughput_warmup_steps}) + return cls(**args) @@ -456,13 +473,10 @@ def main(external_config=None): ] = "DDP" # will use DDP to accelerate if no method specified device = config["Training"]["device"] - if device != "cpu": - pass if not ray.is_initialized(): runtime_env = { "env_vars": { - "OMP_NUM_THREADS": str(resources_per_worker["CPU"]), "FI_TCP_IFACE": "lo", "FI_PROVIDER": "tcp", } diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index c19bda34..6f39af97 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -112,6 +112,10 @@ class Training(BaseModel): gradient_accumulation_steps: int = 1 logging_steps: int = 10 pipelining_fwd_bwd: bool = False + adam_epsilon: Optional[float] = None + throughput_warmup_steps: Optional[int] = None + warmup_ratio: Optional[float] = None + max_grad_norm: Optional[float] = None deepspeed_config_file: str = "" @validator("device") From fd4d31f182708f63c2188b4ce9f1b105a3262758 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Fri, 12 Jul 2024 09:46:12 +0000 Subject: [PATCH 12/12] update --- llm_on_ray/finetune/finetune.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 4d742700..6c9aa71f 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -309,11 +309,9 @@ def load_model(config: Dict): model.generation_config.pad_token_id = 0 model.generation_config.bos_token_id = 1 model.generation_config.eos_token_id = 2 - attn_softmax_bf16 = config["General"]["attn_softmax_bf16"] - if attn_softmax_bf16 and device == "hpu": - model.generation_config.attn_softmax_bf16 - use_flash_attention = config["General"]["use_flash_attention"] - if use_flash_attention and device == "hpu": + if device == "hpu" and config["General"]["attn_softmax_bf16"]: + model.generation_config.attn_softmax_bf16 = True + if device == "hpu" and config["General"]["use_flash_attention"]: model.generation_config.use_flash_attention = True 
model.generation_config.flash_attention_recompute = False model.generation_config.flash_attention_causal_mask = False
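
For reference, below is a minimal self-contained sketch of the batched tokenize-and-mask logic that PATCH 01/03 introduce in data_process.py: prompt and response are tokenized together in one batched call (so padding can be applied across the whole batch, which is what the padding-enabled default path in PATCH 01 relies on), labels start as a copy of input_ids, and the prompt positions are then excluded from the loss. This is an illustration, not code from the patches; it assumes IGNORE_INDEX is the conventional -100 ignored by torch.nn.CrossEntropyLoss, and the model name in the usage comment is arbitrary.

import copy
from typing import List

from transformers import PreTrainedTokenizerBase

IGNORE_INDEX = -100  # assumed value; the ignore_index used by torch.nn.CrossEntropyLoss


def tokenize_and_mask(
    tokenizer: PreTrainedTokenizerBase,
    sources: List[str],
    targets: List[str],
    max_length: int = 512,
    padding: bool = False,
):
    # One batched call over the concatenated prompt+response strings, so the
    # tokenizer handles padding/truncation across the whole batch at once.
    tokenized = tokenizer(
        [s + t for s, t in zip(sources, targets)],
        padding=padding,
        truncation=True,
        max_length=max_length,
        return_tensors=None,
    )
    tokenized["labels"] = copy.deepcopy(tokenized["input_ids"])

    # Tokenize the prompts alone (no padding) to count how many leading tokens
    # belong to the prompt, then mask those label positions out of the loss.
    source_ids = tokenizer(
        sources,
        padding=False,
        truncation=True,
        max_length=max_length,
        return_tensors=None,
    )["input_ids"]
    for idx, ids in enumerate(source_ids):
        prompt_len = min(len(ids), len(tokenized["labels"][idx]))
        tokenized["labels"][idx][:prompt_len] = [IGNORE_INDEX] * prompt_len
    return tokenized


# Example usage (the tokenizer name here is only an illustration):
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("gpt2")
#   batch = tokenize_and_mask(tok, ["Question: 2 + 2 = "], ["4"])
#   # batch["labels"][0] begins with IGNORE_INDEX for every prompt token.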