Fix fine-tuning bugs #265

Open · wants to merge 12 commits into base: main
53 changes: 53 additions & 0 deletions dev/scripts/merge_lora_adaptor.py
@@ -0,0 +1,53 @@
#
# Copyright 2023 The LLM-on-Ray Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm


def apply_lora(base_model_path, lora_path):
print(f"Loading the base model from {base_model_path}")
base_tokenizer = AutoTokenizer.from_pretrained(base_model_path)
base = AutoModelForCausalLM.from_pretrained(
base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)

print(f"Loading the LoRA adapter from {lora_path}")

lora_model = PeftModel.from_pretrained(
base,
lora_path,
)

print("Applying the LoRA")
model = lora_model.merge_and_unload()

return base, model, base_tokenizer


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--base-model-path", type=str, required=True)
parser.add_argument("--lora-model-path", type=str, required=True)
parser.add_argument("--output-path", type=str, required=True)

args = parser.parse_args()

base, target, base_tokenizer = apply_lora(args.base_model_path, args.lora_model_path)
target.save_pretrained(args.output_path)
base_tokenizer.save_pretrained(args.output_path)
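
The merged checkpoint written by this script is a plain Hugging Face model directory, so it can be loaded back without peft. A minimal smoke-test sketch, assuming the script was run with a hypothetical --output-path of ./merged-model (path and prompt are illustrative):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged (base + LoRA) weights produced by merge_lora_adaptor.py.
merged_path = "./merged-model"  # hypothetical --output-path used when running the script
tokenizer = AutoTokenizer.from_pretrained(merged_path)
model = AutoModelForCausalLM.from_pretrained(merged_path)

# Generate a few tokens to confirm the merged weights load and run.
inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
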
2 changes: 1 addition & 1 deletion llm_on_ray/common/logging.py
@@ -27,7 +27,7 @@
logging_config = {
"version": 1,
"loggers": {
"root": {"level": "INFO", "handlers": ["consoleHandler"]},
# "root": {"level": "INFO", "handlers": ["consoleHandler"]},
"common": {
"level": "INFO",
"handlers": ["consoleHandler"],
50 changes: 22 additions & 28 deletions llm_on_ray/finetune/data_process.py
@@ -183,38 +183,32 @@ def tokenize(self, examples):
if len(keys) != 2:
raise ValueError("Unsupported dataset format")

examples["input_ids"] = []
examples["labels"] = []
examples["attention_mask"] = []
zip_examples = []
for s, t in zip(examples[keys[0]], examples[keys[1]]):
results = self.tokenizer(
s + t,
padding=self.padding,
zip_examples.append(s + t)

tokenized_examples = self.tokenizer(
zip_examples,
padding=self.padding,
truncation=self.truncation,
return_tensors=None,
max_length=self.max_length,
)
tokenized_examples["labels"] = copy.deepcopy(tokenized_examples["input_ids"])

if self.mask_input or self.mask_response:
tokenized_sources = self.tokenizer(
examples[keys[0]],
padding=False,
truncation=self.truncation,
return_tensors=None,
max_length=self.max_length,
)

input_ids = results["input_ids"]
input_len = len(input_ids)
labels = copy.deepcopy(input_ids)
if self.mask_input or self.mask_response:
sources_tokenized = self.tokenizer(
s,
padding=False,
truncation=True,
return_tensors=None,
max_length=self.max_length,
)
input_id_len = len(sources_tokenized["input_ids"])
for idx in range(len(tokenized_examples["input_ids"])):
len1 = len(tokenized_examples["input_ids"][idx])
len2 = len(tokenized_sources["input_ids"][idx])
# mask input
if self.mask_input:
labels[:input_id_len] = [IGNORE_INDEX] * input_id_len
tokenized_examples["labels"][idx][:len2] = [IGNORE_INDEX] * len2
# mask response
if self.mask_response:
labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len)

examples["input_ids"].append(results["input_ids"])
examples["labels"].append(labels)
examples["attention_mask"].append(results["attention_mask"])
return examples
tokenized_examples["labels"][idx][len2:len1] = [IGNORE_INDEX] * (len1 - len2)
return tokenized_examples
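
The rewritten tokenize() above replaces the per-example loop with one batched tokenizer call: it concatenates source and target, copies input_ids into labels, and then overwrites the source span (mask_input) or the response span (mask_response) with IGNORE_INDEX. A standalone sketch of the mask_input case, using an illustrative tokenizer choice and toy strings:

import copy
from transformers import AutoTokenizer

IGNORE_INDEX = -100  # ignored by the cross-entropy loss

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model choice
sources = ["Instruction: add 2 and 3.\n"]
targets = ["The answer is 5."]

# Tokenize the concatenated sequences and the sources separately, as tokenize() does.
full = tokenizer([s + t for s, t in zip(sources, targets)])
src = tokenizer(sources)

full["labels"] = copy.deepcopy(full["input_ids"])
for i in range(len(full["input_ids"])):
    src_len = len(src["input_ids"][i])
    # Mask the prompt tokens so the loss is computed only on the response.
    full["labels"][i][:src_len] = [IGNORE_INDEX] * src_len
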
145 changes: 105 additions & 40 deletions llm_on_ray/finetune/finetune.py
@@ -28,6 +28,7 @@

import torch
import datasets
import evaluate
import transformers

from peft import get_peft_model, LoraConfig
@@ -53,10 +54,7 @@ def adapt_transformers_to_device(config: Dict):
adapt_transformers_to_gaudi()


def set_seed(config: Dict):
seed = config["Training"].get("seed", None)
if seed is None:
return
def set_seed(config: Dict, seed):
device = config["Training"]["device"]
if device in ["cpu", "gpu"]:
from accelerate.utils import set_seed as _set_seed
@@ -68,8 +66,17 @@ def set_seed(config: Dict):
_set_seed(seed)


def convert_to_training_args(cls, config: Dict):
def prepare_training_args(config: Dict):
device = config["Training"]["device"]
if device == "hpu":
from optimum.habana.transformers import GaudiTrainingArguments

cls = GaudiTrainingArguments
else:
from transformers import TrainingArguments

cls = TrainingArguments

accelerate_mode = config["Training"]["accelerate_mode"]
save_strategy = config["General"]["save_strategy"]

@@ -92,6 +99,18 @@ def convert_to_training_args(cls, config: Dict):
"do_train": True,
}

adam_epsilon = config["Training"]["adam_epsilon"]
if adam_epsilon is not None:
args.update({"adam_epsilon": adam_epsilon})

warmup_ratio = config["Training"]["warmup_ratio"]
if warmup_ratio is not None:
args.update({"warmup_ratio": warmup_ratio})

max_grad_norm = config["Training"]["max_grad_norm"]
if max_grad_norm is not None:
args.update({"max_grad_norm": max_grad_norm})

# set attr do_eval
vf = config["Dataset"].get("validation_file", None)
vsp = config["Dataset"].get("validation_split_percentage", 0)
@@ -116,13 +135,25 @@ def convert_to_training_args(cls, config: Dict):

# set attr for FSDP
# if accelerate_mode == "FSDP":
# args.updatwe({})
# args.update({})

# set attr for Intel Gaudi
if device == "hpu":
args.update({"use_habana": True})
args.update({"use_lazy_mode": config["Training"]["hpu_execution_mode"] == "lazy"})
args.update({"pipelining_fwd_bwd": True})
args.update({"pipelining_fwd_bwd": config["Training"]["pipelining_fwd_bwd"]})

hpu_execution_mode = config["Training"]["hpu_execution_mode"]
if hpu_execution_mode == "lazy":
args.update({"use_lazy_mode": True})
else:
args.update({"use_lazy_mode": False})
if hpu_execution_mode == "eager.compile":
args.update({"torch_compile": True})
args.update({"torch_compile_backend": "hpu_backend"})

throughput_warmup_steps = config["Training"]["throughput_warmup_steps"]
if throughput_warmup_steps is not None:
args.update({"throughput_warmup_steps": throughput_warmup_steps})

return cls(**args)
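
prepare_training_args builds a plain keyword dict and only forwards config entries that are not None, so unset options fall back to the TrainingArguments defaults. A minimal sketch of that pattern for the CPU/GPU path (output_dir and the override values are illustrative):

from transformers import TrainingArguments

args = {"output_dir": "/tmp/llm-on-ray", "do_train": True}  # illustrative output path
optional = {"adam_epsilon": 1e-8, "warmup_ratio": 0.03, "max_grad_norm": None}
# Forward only the overrides that were actually set in the config.
args.update({k: v for k, v in optional.items() if v is not None})

training_args = TrainingArguments(**args)
print(training_args.warmup_ratio, training_args.max_grad_norm)  # 0.03 and the default 1.0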

@@ -255,12 +286,6 @@ def group_texts(examples):
return tokenized_dataset


def prepare_data_collator(config: Dict, tokenizer):
return transformers.DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
)


def load_model(config: Dict):
model_name = config["General"]["base_model"]
model_dtype = convert_dtype(config["Training"].get("mixed_precision", "no"))
@@ -274,23 +299,37 @@ def load_model(config: Dict):
peft_config = LoraConfig(**lora_config)
model = get_peft_model(model, peft_config)

egc = config["General"].get("enable_gradient_checkpointing", False)
if egc:
if config["General"].get("enable_gradient_checkpointing", False):
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.config.use_cache = False

model.to(dtype=model_dtype, device=torch.device(config["Training"]["device"]))
device = config["Training"]["device"]
if model.config.model_type == "llama":
model.generation_config.pad_token_id = 0
model.generation_config.bos_token_id = 1
model.generation_config.eos_token_id = 2
if device == "hpu" and config["General"]["attn_softmax_bf16"]:
model.generation_config.attn_softmax_bf16 = True
if device == "hpu" and config["General"]["use_flash_attention"]:
model.generation_config.use_flash_attention = True
model.generation_config.flash_attention_recompute = False
model.generation_config.flash_attention_causal_mask = False

# model.to(dtype=model_dtype, device=torch.device(device))

return model


def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator):
def get_trainer(config: Dict, training_args, model, tokenizer, tokenized_dataset):
data_collator = transformers.DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
)

device = config["Training"]["device"]
if device in ["cpu", "gpu"]:
from transformers import Trainer, TrainingArguments
from transformers import Trainer

training_args = convert_to_training_args(TrainingArguments, config)
trainer = Trainer(
model=model,
args=training_args,
@@ -301,13 +340,13 @@ def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator
tokenizer=tokenizer,
data_collator=data_collator,
)
return training_args, trainer
return trainer
elif device in ["hpu"]:
from optimum.habana.transformers import GaudiTrainer
from optimum.habana.transformers import GaudiTrainingArguments
from optimum.habana import GaudiConfig

# If gaudi_config_name is provided, load gaudi_config from huggingface model hub(https://huggingface.co/Habana), otherwise use default gaudi_config
# If gaudi_config_name is provided, load gaudi_config from huggingface model hub(https://huggingface.co/Habana),
# otherwise use default gaudi_config
gaudi_config_name = config["General"].get("gaudi_config_name", None)
if gaudi_config_name is not None:
gaudi_config = GaudiConfig.from_pretrained(gaudi_config_name)
@@ -316,7 +355,23 @@ def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator
gaudi_config.use_fused_adam = True
gaudi_config.use_fused_clip_norm = True

training_args = convert_to_training_args(GaudiTrainingArguments, config)
def preprocess_logits_for_metrics(logits, labels):
if isinstance(logits, tuple):
# Depending on the model and config, logits may contain extra tensors,
# like past_key_values, but logits always come first
logits = logits[0]
return logits.argmax(dim=-1)

metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
preds, labels = eval_preds
# preds have the same shape as the labels, after the argmax(-1) has been calculated
# by preprocess_logits_for_metrics but we need to shift the labels
labels = labels[:, 1:].reshape(-1)
preds = preds[:, :-1].reshape(-1)
return metric.compute(predictions=preds, references=labels)

trainer = GaudiTrainer(
model=model,
args=training_args,
@@ -327,8 +382,12 @@ def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator
else None,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics if training_args.do_eval else None,
preprocess_logits_for_metrics=preprocess_logits_for_metrics
if training_args.do_eval
else None,
)
return training_args, trainer
return trainer
return None
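
The eval path added here to the Gaudi trainer scores next-token accuracy: preprocess_logits_for_metrics reduces the logits to argmax token ids, and compute_metrics shifts the labels by one position so the prediction at step i is compared with the token at step i+1. A toy sketch of that shift (the ids are made up):

import numpy as np
import evaluate

labels = np.array([[5, 6, 7, 8]])
preds = np.array([[6, 7, 8, 0]])  # argmax output; the last position has no next token to score

metric = evaluate.load("accuracy")
result = metric.compute(
    predictions=preds[:, :-1].reshape(-1),  # drop the final prediction
    references=labels[:, 1:].reshape(-1),   # drop the first label
)
print(result)  # {'accuracy': 1.0}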


@@ -337,24 +396,37 @@ def train_func(config: Dict[str, Any]):

adapt_transformers_to_device(config)

set_seed(config)
training_args = prepare_training_args(config)

set_seed(config, training_args.seed)

tokenizer = load_tokenizer(config)

dataset = load_dataset(config)

tokenized_dataset = tokenize_dataset(config, tokenizer, dataset)

data_collator = prepare_data_collator(config, tokenizer)

model = load_model(config)

training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator)
trainer = get_trainer(config, training_args, model, tokenizer, tokenized_dataset)

if training_args.do_train:
common.logger.info("train start")
results = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
trainer.save_model()

metrics = results.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
common.logger.info("train finish")

if training_args.do_eval:
common.logger.info("eval start")
metrics = trainer.evaluate()

common.logger.info("train start")
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
trainer.save_model()
common.logger.info("train finish")
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
common.logger.info("eval finish")


def get_finetune_config():
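
train_func reads everything it needs from the nested config dict. For orientation, an illustrative fragment covering only the keys that appear in this diff (the values are hypothetical, not project defaults):

config = {
    "General": {
        "base_model": "meta-llama/Llama-2-7b-hf",  # illustrative model id
        "save_strategy": "no",
        "enable_gradient_checkpointing": False,
        "gaudi_config_name": None,
        "attn_softmax_bf16": False,
        "use_flash_attention": False,
    },
    "Training": {
        "device": "cpu",
        "accelerate_mode": "DDP",
        "seed": 42,
        "adam_epsilon": None,
        "warmup_ratio": 0.03,
        "max_grad_norm": 1.0,
        "hpu_execution_mode": "lazy",
        "pipelining_fwd_bwd": False,
        "throughput_warmup_steps": None,
        "mixed_precision": "no",
    },
    "Dataset": {
        "validation_file": None,
        "validation_split_percentage": 5,
    },
}
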
@@ -398,18 +470,11 @@ def main(external_config=None):
"accelerate_mode"
] = "DDP" # will use DDP to accelerate if no method specified

ccl_worker_count = 1
device = config["Training"]["device"]
if device != "cpu":
ccl_worker_count = num_training_workers

if not ray.is_initialized():
runtime_env = {
"env_vars": {
"OMP_NUM_THREADS": str(resources_per_worker["CPU"]),
"CCL_ZE_IPC_EXCHANGE": "sockets",
"CCL_WORKER_COUNT": str(ccl_worker_count),
"CCL_LOG_LEVEL": "info",
"FI_TCP_IFACE": "lo",
"FI_PROVIDER": "tcp",
}
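
These env_vars are injected into every Ray worker through the job's runtime environment. A minimal sketch of how such a runtime_env reaches ray.init (the thread count is an illustrative value):

import ray

runtime_env = {
    "env_vars": {
        "OMP_NUM_THREADS": "56",  # illustrative CPUs-per-worker value
        "CCL_ZE_IPC_EXCHANGE": "sockets",
        "FI_TCP_IFACE": "lo",
        "FI_PROVIDER": "tcp",
    }
}

if not ray.is_initialized():
    ray.init(runtime_env=runtime_env)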