From 90819066b498da3c807357627a1bfcf5d40306d0 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 1 Apr 2024 02:29:30 +0000 Subject: [PATCH 01/24] implement fine-tuning chat template function Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 91 +++++-------------- llm_on_ray/finetune/finetune.py | 3 + llm_on_ray/finetune/finetune.yaml | 2 + llm_on_ray/finetune/finetune_config.py | 9 +- 4 files changed, 34 insertions(+), 71 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 37235b425..7c95b92ab 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -23,53 +23,9 @@ from llm_on_ray.common.dataprocesser import DataProcesser -INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." -INSTRUCTION_KEY = "### Instruction:" -INPUT_KEY = "Input:" RESPONSE_KEY = "### Response:" -END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" -PROMPT_NO_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{response_key} -{response} - -{end_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - response_key=RESPONSE_KEY, - response="{response}", - end_key=END_KEY, -) - -PROMPT_WITH_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{input_key} -{input} - -{response_key} -{response} - -{end_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - input_key=INPUT_KEY, - input="{input}", - response_key=RESPONSE_KEY, - response="{response}", - end_key=END_KEY, -) -TEXT_COLUMN_NAME = "text" - class DataCollatorForCompletionOnlyLM(transformers.DataCollatorForLanguageModeling): def torch_call(self, examples): @@ -101,6 +57,7 @@ def torch_call(self, examples): class GeneralProcesser(DataProcesser): def tokenize_dataset(self, tokenizer, dataset): max_length = self.config.get("max_length") + custom_chat_template = self.config.get("custom_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") tokenizer.pad_token = tokenizer.eos_token @@ -111,35 +68,29 @@ def tokenize_dataset(self, tokenizer, dataset): if isinstance(dataset, datasets.DatasetDict): column_names = dataset["train"].column_names - if column_names and TEXT_COLUMN_NAME not in column_names: - - def prompt(rec): - instruction = rec["instruction"] - response = rec["response"] - context = rec.get("context") - if not instruction: - raise ValueError(f"Expected an instruction in: {rec}") - if not response: - raise ValueError(f"Expected a response in: {rec}") - if context: - rec["text"] = PROMPT_WITH_INPUT_FORMAT.format( - instruction=instruction, response=response, input=context + def tokenize_function(examples): + if self.config.get("is_base_model"): + if custom_chat_template: + new_tokenizer = tokenizer.apply_chat_template( + examples, + chat_template=custom_chat_template, + tokenize=True, + max_length=max_length, ) else: - rec["text"] = PROMPT_NO_INPUT_FORMAT.format( - instruction=instruction, response=response + new_tokenizer = tokenizer.apply_chat_template( + examples, + chat_template=self.config.get("default_chat_template"), + tokenize=True, + max_length=max_length, ) - return rec - - dataset = dataset.map( - prompt, - load_from_cache_file=False, - desc="Prompt", - ) - column_names += [TEXT_COLUMN_NAME] - - def tokenize_function(examples): - return 
tokenizer(examples[TEXT_COLUMN_NAME], max_length=max_length) + else: + new_tokenizer = tokenizer.apply_chat_template( + examples, tokenize=False, max_length=max_length + ) + print(new_tokenizer) + print(new_tokenizer.default_chat_template) + return new_tokenizer tokenized_datasets = dataset.map( tokenize_function, diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 29d955a49..444b7649d 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -230,6 +230,9 @@ def train_func(config: Dict[str, Any]): "group": config["Dataset"].get("group", True), "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), + "name": tokenizer_name, + "config": config["General"]["config"], + "custom_chat_template": config["General"]["custom_chat_template"], } ) tokenized_datasets = dataprocesser.tokenize_dataset(tokenizer, datasets) diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 627a88753..3baf110c3 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -1,5 +1,6 @@ General: base_model: EleutherAI/gpt-j-6b + is_base_model: false gpt_base_model: true output_dir: /tmp/llm-ray/output save_strategy: no @@ -12,6 +13,7 @@ General: lora_alpha: 32 lora_dropout: 0.1 enable_gradient_checkpointing: false + custom_chat_template: null Dataset: train_file: examples/data/sample_finetune_data_small.jsonl group: true diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 030fcc5a6..bbc814d28 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -17,7 +17,6 @@ from pydantic import BaseModel, validator from typing import Optional, List - PRECISION_BF16 = "bf16" PRECISION_FP16 = "fp16" PRECISION_NO = "no" @@ -61,6 +60,14 @@ class General(BaseModel): lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False + custom_chat_template: Optional[str] = None + default_chat_template: str = ( + "{{'### Below is an instruction that describes a task. " + "Write a response that appropriately completes the request. 
\n'}}" + "{% for message in messages %}{{'### Instruction: ' + message['instruction'] " + "+ ' Input:' + message['context'] + ' ### Response:' + message['response'] " + "+ '### End \n'}}{% endfor %}" + ) class Dataset(BaseModel): From 7f7d404d805d686d9af7bfdbeb9153eae4be92ca Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 2 Apr 2024 08:25:34 +0000 Subject: [PATCH 02/24] update Signed-off-by: minmingzhu --- llm_on_ray/common/trainer/default_trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/common/trainer/default_trainer.py b/llm_on_ray/common/trainer/default_trainer.py index 8825f08be..5509bc3a1 100644 --- a/llm_on_ray/common/trainer/default_trainer.py +++ b/llm_on_ray/common/trainer/default_trainer.py @@ -33,6 +33,7 @@ class DefaultTrainer(Trainer): def __init__(self, config): self.model = None + self.tokenizer = None self.config = config dataprocesser_config = config.get("dataprocesser") dataprocesser_type = dataprocesser_config.get("type") @@ -121,7 +122,7 @@ def _get_lr_scheduler( def prepare(self, model, tokenizer, dataset, optimizer, accelerator): self._coordinate(accelerator) - + self.tokenizer = tokenizer embedding_size = model.get_input_embeddings().weight.shape[0] logger.info(f"model embedding size: {embedding_size}") if len(tokenizer) > embedding_size: @@ -290,6 +291,11 @@ def train(self): is_main_process=self.accelerator.is_main_process, save_function=self.accelerator.save, ) + self.tokenizer.save_pretrained( + output, + is_main_process=self.accelerator.is_main_process, + save_function=self.accelerator.save, + ) logger.info(f"finish save model to {output}") self.accelerator.wait_for_everyone() From a3ce22feeca6ae7025307707a104022af66ef224 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 05:41:44 +0000 Subject: [PATCH 03/24] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 7c95b92ab..bffebbd68 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -58,6 +58,8 @@ class GeneralProcesser(DataProcesser): def tokenize_dataset(self, tokenizer, dataset): max_length = self.config.get("max_length") custom_chat_template = self.config.get("custom_chat_template") + model_default_chat_template = self.config.get("model_default_chat_template") + group = self.config.get("group") block_size = self.config.get("block_size") tokenizer.pad_token = tokenizer.eos_token @@ -71,25 +73,44 @@ def tokenize_dataset(self, tokenizer, dataset): def tokenize_function(examples): if self.config.get("is_base_model"): if custom_chat_template: - new_tokenizer = tokenizer.apply_chat_template( + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( examples, - chat_template=custom_chat_template, - tokenize=True, + tokenize=False, max_length=max_length, ) else: + tokenizer.chat_template = self.config.get("default_chat_template") new_tokenizer = tokenizer.apply_chat_template( examples, - chat_template=self.config.get("default_chat_template"), - tokenize=True, + tokenize=False, max_length=max_length, ) else: - new_tokenizer = tokenizer.apply_chat_template( - examples, tokenize=False, max_length=max_length - ) - print(new_tokenizer) - print(new_tokenizer.default_chat_template) + if model_default_chat_template: + 
tokenizer.chat_template = model_default_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + new_messages = [ + { + "role": "user", + "content": "instruction: " + + examples["instruction"] + + " context: " + + examples["context"], + }, + {"role": "assistant", "content": "response: " + examples["response"]}, + ] + + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) return new_tokenizer tokenized_datasets = dataset.map( From b10cda384d0f5387bb7e9c8078c698c29923d87f Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 05:59:44 +0000 Subject: [PATCH 04/24] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 45 +++++++++++++++++++++++++- llm_on_ray/finetune/finetune_config.py | 11 ++++--- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 444b7649d..cc6018442 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -232,7 +232,6 @@ def train_func(config: Dict[str, Any]): "shuffle": config["Dataset"].get("shuffle", False), "name": tokenizer_name, "config": config["General"]["config"], - "custom_chat_template": config["General"]["custom_chat_template"], } ) tokenized_datasets = dataprocesser.tokenize_dataset(tokenizer, datasets) @@ -284,6 +283,50 @@ def train_func(config: Dict[str, Any]): tokenizer=tokenizer, data_collator=data_collator, ) + trainer = common.trainer.Trainer.registory.get("DefaultTrainer")( + config={ + "device": config["Training"]["device"], + "accelerate_mode": config["Training"]["accelerate_mode"], + "num_train_epochs": epochs, + "max_train_steps": config["Training"].get("max_train_steps", None), + "logging_steps": config["Training"].get("logging_steps", 1), + "output": output_dir, + "dataprocesser": { + "type": "GeneralProcesser", + "per_device_train_batch_size": config["Training"]["batch_size"], + "per_device_eval_batch_size": config["Training"]["batch_size"], + "preprocessing_num_workers": config["Dataset"].get("preprocessing_num_workers", 1), + "max_length": config["Dataset"].get("max_length", 512), + "group": config["Dataset"].get("group", True), + "block_size": config["Dataset"].get("block_size", 512), + "shuffle": config["Dataset"].get("shuffle", False), + "is_base_model": config["General"]["is_base_model"], + "custom_chat_template": config["General"]["custom_chat_template"], + "default_chat_template": config["General"]["default_chat_template"], + "model_default_chat_template": config["General"]["model_default_chat_template"], + }, + "lr_scheduler": { + "enable": True, + "max_train_steps": None, + "lr_scheduler_type": config["Training"]["lr_scheduler"], + "num_warmup_steps": 0, + "learning_rate": config["Training"]["learning_rate"], + "weight_decay": config["Training"]["weight_decay"], + }, + "checkpoint": { + "root_path": config["General"].get("checkpoint_dir", None), + }, + } + ) + + try: + common.logger.info("trainer prepare start") + model.training = True + trainer.prepare(model, tokenizer, datasets, optimizer, accelerator) + except Exception as e: + common.logger.critical(e, exc_info=True) + exit(1) + common.logger.info("trainer prepare finish") common.logger.info("train start") trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index bbc814d28..e5cd5dcf7 100644 --- 
a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -62,12 +62,13 @@ class General(BaseModel): enable_gradient_checkpointing: bool = False custom_chat_template: Optional[str] = None default_chat_template: str = ( - "{{'### Below is an instruction that describes a task. " - "Write a response that appropriately completes the request. \n'}}" - "{% for message in messages %}{{'### Instruction: ' + message['instruction'] " - "+ ' Input:' + message['context'] + ' ### Response:' + message['response'] " - "+ '### End \n'}}{% endfor %}" + "{{'### Below is an instruction that describes a task." + "Write a response that appropriately completes the request. '}}" + "{{'### Instruction: ' + messages['instruction'] " + "+ ' Input:' + messages['context'] + ' ### Response:' + messages['response'] " + "+ '### End \n'}}" ) + model_default_chat_template: Optional[str] = None class Dataset(BaseModel): From 049304ab8abcfb04765ebc597580e454ccc50fa6 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 06:18:40 +0000 Subject: [PATCH 05/24] integrate gbt for transformer 4.26.0 Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 61 +++++++++++++++++++ llm_on_ray/finetune/finetune.py | 1 + 2 files changed, 62 insertions(+) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index bffebbd68..9ac21f979 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -23,9 +23,53 @@ from llm_on_ray.common.dataprocesser import DataProcesser +INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." +INSTRUCTION_KEY = "### Instruction:" +INPUT_KEY = "Input:" RESPONSE_KEY = "### Response:" +END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" +PROMPT_NO_INPUT_FORMAT = """{intro} + +{instruction_key} +{instruction} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, +) + +PROMPT_WITH_INPUT_FORMAT = """{intro} + +{instruction_key} +{instruction} + +{input_key} +{input} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + input_key=INPUT_KEY, + input="{input}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, +) +TEXT_COLUMN_NAME = "text" + class DataCollatorForCompletionOnlyLM(transformers.DataCollatorForLanguageModeling): def torch_call(self, examples): @@ -71,6 +115,23 @@ def tokenize_dataset(self, tokenizer, dataset): column_names = dataset["train"].column_names def tokenize_function(examples): + if self.config.get("gpt_base_model"): + instruction = examples["instruction"] + response = examples["response"] + context = examples.get("context") + if not instruction: + raise ValueError(f"Expected an instruction in: {examples}") + if not response: + raise ValueError(f"Expected a response in: {examples}") + if context: + examples["text"] = PROMPT_WITH_INPUT_FORMAT.format( + instruction=instruction, response=response, input=context + ) + else: + examples["text"] = PROMPT_NO_INPUT_FORMAT.format( + instruction=instruction, response=response + ) + return tokenizer(examples["text"], max_length=max_length, truncation=True) if self.config.get("is_base_model"): if custom_chat_template: 
tokenizer.chat_template = custom_chat_template diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index cc6018442..fdd079838 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -300,6 +300,7 @@ def train_func(config: Dict[str, Any]): "group": config["Dataset"].get("group", True), "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), + "gpt_base_model": config["General"].get("gpt_base_model", False), "is_base_model": config["General"]["is_base_model"], "custom_chat_template": config["General"]["custom_chat_template"], "default_chat_template": config["General"]["default_chat_template"], From 63a121749073c32a749181c0f26f0cec20f6f411 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 06:40:23 +0000 Subject: [PATCH 06/24] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 85 ++++++++++--------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 9ac21f979..df539835b 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -124,55 +124,56 @@ def tokenize_function(examples): if not response: raise ValueError(f"Expected a response in: {examples}") if context: - examples["text"] = PROMPT_WITH_INPUT_FORMAT.format( + new_message = PROMPT_WITH_INPUT_FORMAT.format( instruction=instruction, response=response, input=context ) else: - examples["text"] = PROMPT_NO_INPUT_FORMAT.format( + new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(examples["text"], max_length=max_length, truncation=True) - if self.config.get("is_base_model"): - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("default_chat_template") - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + return tokenizer.tokenize(new_message, max_length=max_length) else: - if model_default_chat_template: - tokenizer.chat_template = model_default_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + if self.config.get("is_base_model"): + if custom_chat_template: + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + tokenizer.chat_template = self.config.get("default_chat_template") + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) else: - new_messages = [ - { - "role": "user", - "content": "instruction: " - + examples["instruction"] - + " context: " - + examples["context"], - }, - {"role": "assistant", "content": "response: " + examples["response"]}, - ] - - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - return new_tokenizer + if model_default_chat_template: + tokenizer.chat_template = model_default_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + new_messages = [ + { + "role": "user", + "content": "instruction: " + + 
examples["instruction"] + + " context: " + + examples["context"], + }, + {"role": "assistant", "content": "response: " + examples["response"]}, + ] + + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) + return new_tokenizer tokenized_datasets = dataset.map( tokenize_function, From 58c95847a5ad0f842d1c8800b417037b6394216a Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 08:56:23 +0000 Subject: [PATCH 07/24] update Signed-off-by: minmingzhu --- llm_on_ray/common/dataprocesser/general_processer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index df539835b..f752dc9ac 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -131,7 +131,7 @@ def tokenize_function(examples): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer.tokenize(new_message, max_length=max_length) + return tokenizer(new_message, max_length=max_length) else: if self.config.get("is_base_model"): if custom_chat_template: @@ -173,7 +173,7 @@ def tokenize_function(examples): tokenize=False, max_length=max_length, ) - return new_tokenizer + return tokenizer(new_tokenizer, max_length=max_length) tokenized_datasets = dataset.map( tokenize_function, From e2193cab144164305c4ed30ab4d9f9707ac99de0 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 9 Apr 2024 07:10:41 +0000 Subject: [PATCH 08/24] 1. remove is_base_model tag 2. modify chat template Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 72 ++++++++----------- llm_on_ray/finetune/finetune.py | 4 +- llm_on_ray/finetune/finetune.yaml | 1 - llm_on_ray/finetune/finetune_config.py | 22 ++++-- llm_on_ray/finetune/models/mpt-7b.yaml | 1 + 5 files changed, 48 insertions(+), 52 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index f752dc9ac..3c39eb429 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -102,7 +102,6 @@ class GeneralProcesser(DataProcesser): def tokenize_dataset(self, tokenizer, dataset): max_length = self.config.get("max_length") custom_chat_template = self.config.get("custom_chat_template") - model_default_chat_template = self.config.get("model_default_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") @@ -133,46 +132,37 @@ def tokenize_function(examples): ) return tokenizer(new_message, max_length=max_length) else: - if self.config.get("is_base_model"): - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("default_chat_template") - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + new_messages = [ + { + "role": "user", + "content": INTRO_BLURB + "\n\n" + + "###Instruction:\n" + + examples["instruction"] + "\n\n" + + "###context:\n" + + examples["context"] + "\n\n", + }, + {"role": "assistant", "content": examples["response"]}, + ] + if custom_chat_template: + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + 
tokenize=False, + max_length=max_length, + ) + elif tokenizer.chat_template is not None: + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) else: - if model_default_chat_template: - tokenizer.chat_template = model_default_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - new_messages = [ - { - "role": "user", - "content": "instruction: " - + examples["instruction"] - + " context: " - + examples["context"], - }, - {"role": "assistant", "content": "response: " + examples["response"]}, - ] - - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) + tokenizer.chat_template = self.config.get("chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) return tokenizer(new_tokenizer, max_length=max_length) tokenized_datasets = dataset.map( @@ -194,7 +184,7 @@ def group_texts(examples): total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + k: [t[i: i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index fdd079838..5bcff4ad9 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -301,10 +301,8 @@ def train_func(config: Dict[str, Any]): "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), "gpt_base_model": config["General"].get("gpt_base_model", False), - "is_base_model": config["General"]["is_base_model"], "custom_chat_template": config["General"]["custom_chat_template"], - "default_chat_template": config["General"]["default_chat_template"], - "model_default_chat_template": config["General"]["model_default_chat_template"], + "chat_template": config["General"]["chat_template"], }, "lr_scheduler": { "enable": True, diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 3baf110c3..15b38501f 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -1,6 +1,5 @@ General: base_model: EleutherAI/gpt-j-6b - is_base_model: false gpt_base_model: true output_dir: /tmp/llm-ray/output save_strategy: no diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index e5cd5dcf7..11ae93ce0 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -61,14 +61,22 @@ class General(BaseModel): deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False custom_chat_template: Optional[str] = None - default_chat_template: str = ( - "{{'### Below is an instruction that describes a task." - "Write a response that appropriately completes the request. 
'}}" - "{{'### Instruction: ' + messages['instruction'] " - "+ ' Input:' + messages['context'] + ' ### Response:' + messages['response'] " - "+ '### End \n'}}" + chat_template: Optional[str] = ( + "{{ bos_token }}" + "{% if messages[0]['role'] == 'system' %}" + "{{ raise_exception('System role not supported') }}" + "{% endif %}" + "{% for message in messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if message['role'] == 'user' %}" + "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{% elif message['role'] == 'assistant' %}" + "{{ '### Response:' + message['content'] + eos_token }}" + "{% endif %}{% endfor %}" + "{{'### End \n'}}" ) - model_default_chat_template: Optional[str] = None class Dataset(BaseModel): diff --git a/llm_on_ray/finetune/models/mpt-7b.yaml b/llm_on_ray/finetune/models/mpt-7b.yaml index ef2efa006..8e719f186 100644 --- a/llm_on_ray/finetune/models/mpt-7b.yaml +++ b/llm_on_ray/finetune/models/mpt-7b.yaml @@ -1,6 +1,7 @@ General: base_model: mosaicml/mpt-7b tokenizer_name: EleutherAI/gpt-neox-20b + is_base_model: false gpt_base_model: false output_dir: /tmp/llm-ray/output save_strategy: no From 1090bf0ec5168bbf06e4d5fa0057cbfc81dcc693 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 01:25:29 +0000 Subject: [PATCH 09/24] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 118 +++++++++--------- pyproject.toml | 3 +- 2 files changed, 64 insertions(+), 57 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 3c39eb429..5782fecd9 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -99,9 +99,67 @@ def torch_call(self, examples): class GeneralProcesser(DataProcesser): - def tokenize_dataset(self, tokenizer, dataset): - max_length = self.config.get("max_length") - custom_chat_template = self.config.get("custom_chat_template") + def tokenize_function(self, examples, tokenizer): + print(examples) + if self.config.get("gpt_base_model"): + instruction = examples["instruction"] + response = examples["response"] + context = examples.get("context") + if not instruction: + raise ValueError(f"Expected an instruction in: {examples}") + if not response: + raise ValueError(f"Expected a response in: {examples}") + if context: + new_message = PROMPT_WITH_INPUT_FORMAT.format( + instruction=instruction, response=response, input=context + ) + else: + new_message = PROMPT_NO_INPUT_FORMAT.format( + instruction=instruction, response=response + ) + return tokenizer(new_message, max_length=self.config.get("max_length")) + else: + new_messages = [ + { + "role": "user", + "content": "###Instruction:\n" + + examples["instruction"] + "\n\n" + + "###context:\n" + + examples["context"] + "\n\n", + }, + {"role": "assistant", "content": examples["response"] + "\n\n"}, + ] + print(new_messages) + if self.config.get("custom_chat_template") is not None: + print("custom_chat_template") + tokenizer.chat_template = self.config.get("custom_chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + elif tokenizer.chat_template is not None: + print("tokenizer.chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + 
max_length=self.config.get("max_length"), + ) + else: + print("chat_template") + tokenizer.chat_template = self.config.get("chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + tokenizer = tokenizer(new_tokenizer, max_length=self.config.get("max_length")) + print(tokenizer) + return tokenizer + + def prepare(self, tokenizer, dataset): + per_device_train_batch_size = self.config.get("per_device_train_batch_size") + per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") group = self.config.get("group") block_size = self.config.get("block_size") @@ -113,60 +171,8 @@ def tokenize_dataset(self, tokenizer, dataset): if isinstance(dataset, datasets.DatasetDict): column_names = dataset["train"].column_names - def tokenize_function(examples): - if self.config.get("gpt_base_model"): - instruction = examples["instruction"] - response = examples["response"] - context = examples.get("context") - if not instruction: - raise ValueError(f"Expected an instruction in: {examples}") - if not response: - raise ValueError(f"Expected a response in: {examples}") - if context: - new_message = PROMPT_WITH_INPUT_FORMAT.format( - instruction=instruction, response=response, input=context - ) - else: - new_message = PROMPT_NO_INPUT_FORMAT.format( - instruction=instruction, response=response - ) - return tokenizer(new_message, max_length=max_length) - else: - new_messages = [ - { - "role": "user", - "content": INTRO_BLURB + "\n\n" - + "###Instruction:\n" - + examples["instruction"] + "\n\n" - + "###context:\n" - + examples["context"] + "\n\n", - }, - {"role": "assistant", "content": examples["response"]}, - ] - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - elif tokenizer.chat_template is not None: - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("chat_template") - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - return tokenizer(new_tokenizer, max_length=max_length) - tokenized_datasets = dataset.map( - tokenize_function, + lambda examples: self.tokenize_function(examples, tokenizer), remove_columns=column_names, load_from_cache_file=False, desc="Tokenize dataset", diff --git a/pyproject.toml b/pyproject.toml index b319045cc..a18574675 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,8 @@ dependencies = [ "py-cpuinfo", "pydantic-yaml", "async_timeout", - "typer" + "typer", + "jinja2>=3.0.0" ] [project.optional-dependencies] From 6bdd664eab764fb20025e210aed206fead3cb302 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:32:22 +0000 Subject: [PATCH 10/24] 1. update doc/finetune_parameters.md 2. add unit test Signed-off-by: minmingzhu --- docs/finetune_parameters.md | 2 + .../common/dataprocesser/general_processer.py | 15 +- tests/finetune/test_chat_template.py | 139 ++++++++++++++++++ 3 files changed, 145 insertions(+), 11 deletions(-) create mode 100644 tests/finetune/test_chat_template.py diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 4f113e69f..d80b6e46d 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -16,6 +16,8 @@ The following are the parameters supported in the finetuning workflow. 
|lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
|deltatuner_config|"algo": "lora"<br>"denas": True<br>
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| |enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime| +|chat_template|"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + eos_token }}{{ '### Response:' + message['content'] + eos_token }}{% endif %}{% endfor %}{{'### End \n'}}"|LLM-on-Ray default chat default.| +|custom_chat_template|None|User-defined chat template.| ## Dataset Parameters diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 5782fecd9..64b0c75d3 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -100,7 +100,6 @@ def torch_call(self, examples): class GeneralProcesser(DataProcesser): def tokenize_function(self, examples, tokenizer): - print(examples) if self.config.get("gpt_base_model"): instruction = examples["instruction"] response = examples["response"] @@ -117,7 +116,7 @@ def tokenize_function(self, examples, tokenizer): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(new_message, max_length=self.config.get("max_length")) + return tokenizer(new_message, add_special_tokens=False, max_length=self.config.get("max_length")) else: new_messages = [ { @@ -129,32 +128,26 @@ def tokenize_function(self, examples, tokenizer): }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] - print(new_messages) if self.config.get("custom_chat_template") is not None: - print("custom_chat_template") tokenizer.chat_template = self.config.get("custom_chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) elif tokenizer.chat_template is not None: - print("tokenizer.chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) else: - print("chat_template") tokenizer.chat_template = self.config.get("chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) - tokenizer = tokenizer(new_tokenizer, max_length=self.config.get("max_length")) - print(tokenizer) + tokenizer = tokenizer(new_tokenizer, + add_special_tokens=False, + max_length=self.config.get("max_length")) return tokenizer def prepare(self, tokenizer, dataset): diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py new file mode 100644 index 000000000..7cdda115c --- /dev/null +++ b/tests/finetune/test_chat_template.py @@ -0,0 +1,139 @@ +import unittest + +import transformers +from transformers import AutoTokenizer +from llm_on_ray.common.dataprocesser.general_processer import GeneralProcesser + + +class TestTokenizeFunction(unittest.TestCase): + def setUp(self): + self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf') + 
self.config = { + 'gpt_base_model': True, + 'max_length': 512, + 'trust_remote_code': False, + 'chat_template': "Below is an instruction that describes a task. Write a response that appropriately " + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " + "End \n'}}", + } + self.processer = GeneralProcesser(self.config) + + def test_tokenize_function_with_gpt_model(self): + self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') + + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = 'Below is an instruction that describes a task. Write a response that '\ + 'appropriately completes the request.\n'\ + '\n'\ + '### Instruction:\n'\ + 'Test instruction\n'\ + '\n'\ + 'Input:\n'\ + 'Test context\n'\ + '\n'\ + '### Response:\n'\ + 'Test response\n'\ + '\n'\ + '### End' + + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_custom_chat_template(self): + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = '<|im_start|>user\n' \ + '###Instruction:\n' \ + 'Test instruction\n' \ + '\n' \ + '###context:\n' \ + 'Test context\n' \ + '\n' \ + '<|im_end|><|im_start|>assistant\n' \ + 'Test response\n' \ + '\n' \ + '<|im_end|>' + # Set custom chat template + self.config['custom_chat_template'] = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"\ + "+ message['content'] + '<|im_end|>'}}{% endfor %}" + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_chat_template(self): + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = 'Below is an instruction that describes a task. 
Write a response that '\ + 'appropriately completes the request\n'\ + '### Instruction: ###Instruction:\n'\ + 'Test instruction\n'\ + '\n'\ + '###context:\n'\ + 'Test context\n'\ + '\n'\ + '### Response: Test response\n'\ + '\n'\ + '### End \n'\ + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_default_chat_template(self): + self.tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it') + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + chat_example = [ + { + "role": "user", + "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", + + }, + { + "role": "assistant", + "content": "Test response\n\n", + } + ] + + # Verify the format of the result + expected_result = self.tokenizer.apply_chat_template(chat_example, + tokenize=False, + max_length=self.config.get("max_length")) + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + +if __name__ == '__main__': + unittest.main() From 4f0d118bb131a1cc64026525d4b88f33bdadfc48 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:44:21 +0000 Subject: [PATCH 11/24] update Signed-off-by: minmingzhu --- docs/finetune_parameters.md | 3 +- .../common/dataprocesser/general_processer.py | 6 +- llm_on_ray/finetune/finetune.py | 55 ++++--------------- llm_on_ray/finetune/finetune_config.py | 4 +- 4 files changed, 16 insertions(+), 52 deletions(-) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index d80b6e46d..cebb1449e 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -16,8 +16,7 @@ The following are the parameters supported in the finetuning workflow. |lora_config|task_type: CAUSAL_LM
r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
|deltatuner_config|"algo": "lora"<br>"denas": True<br>
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| |enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime| -|chat_template|"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + eos_token }}{{ '### Response:' + message['content'] + eos_token }}{% endif %}{% endfor %}{{'### End \n'}}"|LLM-on-Ray default chat default.| -|custom_chat_template|None|User-defined chat template.| +|chat_template|None|User-defined chat template.| ## Dataset Parameters diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 64b0c75d3..636ec006c 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -128,8 +128,8 @@ def tokenize_function(self, examples, tokenizer): }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] - if self.config.get("custom_chat_template") is not None: - tokenizer.chat_template = self.config.get("custom_chat_template") + if self.config.get("chat_template") is not None: + tokenizer.chat_template = self.config.get("chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, @@ -140,7 +140,7 @@ def tokenize_function(self, examples, tokenizer): tokenize=False, ) else: - tokenizer.chat_template = self.config.get("chat_template") + tokenizer.chat_template = self.config.get("default_chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 5bcff4ad9..85a678553 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -14,7 +14,7 @@ # limitations under the License. 
# -#!/usr/bin/env python +# !/usr/bin/env python import os import argparse @@ -283,49 +283,6 @@ def train_func(config: Dict[str, Any]): tokenizer=tokenizer, data_collator=data_collator, ) - trainer = common.trainer.Trainer.registory.get("DefaultTrainer")( - config={ - "device": config["Training"]["device"], - "accelerate_mode": config["Training"]["accelerate_mode"], - "num_train_epochs": epochs, - "max_train_steps": config["Training"].get("max_train_steps", None), - "logging_steps": config["Training"].get("logging_steps", 1), - "output": output_dir, - "dataprocesser": { - "type": "GeneralProcesser", - "per_device_train_batch_size": config["Training"]["batch_size"], - "per_device_eval_batch_size": config["Training"]["batch_size"], - "preprocessing_num_workers": config["Dataset"].get("preprocessing_num_workers", 1), - "max_length": config["Dataset"].get("max_length", 512), - "group": config["Dataset"].get("group", True), - "block_size": config["Dataset"].get("block_size", 512), - "shuffle": config["Dataset"].get("shuffle", False), - "gpt_base_model": config["General"].get("gpt_base_model", False), - "custom_chat_template": config["General"]["custom_chat_template"], - "chat_template": config["General"]["chat_template"], - }, - "lr_scheduler": { - "enable": True, - "max_train_steps": None, - "lr_scheduler_type": config["Training"]["lr_scheduler"], - "num_warmup_steps": 0, - "learning_rate": config["Training"]["learning_rate"], - "weight_decay": config["Training"]["weight_decay"], - }, - "checkpoint": { - "root_path": config["General"].get("checkpoint_dir", None), - }, - } - ) - - try: - common.logger.info("trainer prepare start") - model.training = True - trainer.prepare(model, tokenizer, datasets, optimizer, accelerator) - except Exception as e: - common.logger.critical(e, exc_info=True) - exit(1) - common.logger.info("trainer prepare finish") common.logger.info("train start") trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) @@ -401,7 +358,15 @@ def main(external_config=None): ) # additional 1 for head worker ray.init(num_cpus=num_cpus, runtime_env=runtime_env) else: - ray.init(runtime_env=runtime_env) + import intel_extension_for_pytorch as ipex + + if "xpu" in ipex.__version__: + num_cpus = ( + resources_per_worker["CPU"] * num_training_workers + 1 + ) # additional 1 for head worker + ray.init(num_cpus=num_cpus, runtime_env=runtime_env) + else: + ray.init(runtime_env=runtime_env) common.logger.info(f"ray available resources = {ray.available_resources()}") use_gpu = True if device == "gpu" else False diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 11ae93ce0..bbbb916af 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -60,8 +60,8 @@ class General(BaseModel): lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False - custom_chat_template: Optional[str] = None - chat_template: Optional[str] = ( + chat_template: Optional[str] = None + default_chat_template: str = ( "{{ bos_token }}" "{% if messages[0]['role'] == 'system' %}" "{{ raise_exception('System role not supported') }}" From e08a93c18cbeca8227c6d3d7fba38d01fc7372c3 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Tue, 9 Apr 2024 09:19:24 +0800 Subject: [PATCH 12/24] Support latest Ray 2.10 release (#158) * update * fix blocking * update Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * fix setup and getting started 
Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang * Add dependencies for tests and update pyproject.toml Signed-off-by: Wu, Xiaochang * Update dependencies and test workflow Signed-off-by: Wu, Xiaochang * Update dependencies and fix torch_dist.py Signed-off-by: Wu, Xiaochang * Update OpenAI SDK installation and start ray cluster Signed-off-by: Wu, Xiaochang --------- Signed-off-by: Wu, Xiaochang --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a18574675..451d2649d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,9 +34,9 @@ dependencies = [ "deltatuner==1.1.9", "py-cpuinfo", "pydantic-yaml", - "async_timeout", - "typer", - "jinja2>=3.0.0" + "async-timeout", + "jinja2>=3.0.0", + "typer" ] [project.optional-dependencies] From 1bbaf2285683a48e879a9e7aee196fabd8ee6ad7 Mon Sep 17 00:00:00 2001 From: yutianchen Date: Tue, 9 Apr 2024 15:38:35 +0800 Subject: [PATCH 13/24] [Tests] Add query single test (#156) * single test * single test * single test * single test * fix hang error --- tests/inference/test_query_single.py | 107 +++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/inference/test_query_single.py diff --git a/tests/inference/test_query_single.py b/tests/inference/test_query_single.py new file mode 100644 index 000000000..1c32f6b73 --- /dev/null +++ b/tests/inference/test_query_single.py @@ -0,0 +1,107 @@ +import subprocess +import pytest +import os + +os.environ["no_proxy"] = "localhost,127.0.0.1" + + +def start_serve(model_name): + current_path = os.path.dirname(os.path.abspath(__file__)) + + config_path = os.path.join( + current_path, "../../.github/workflows/config/" + model_name + "-ci.yaml" + ) + + cmd_serve = ["llm_on_ray-serve", "--config_file", config_path, "--simple"] + + result_serve = subprocess.run(cmd_serve, capture_output=True, text=True) + + # Ensure there are no errors in the serve script execution + assert result_serve.returncode == 0, print( + "\n" + "Serve error stderr message: " + "\n", result_serve.stderr + ) + + # Print the output of subprocess.run for checking if output is expected + print("\n" + "Serve message: " + "\n", result_serve.stdout) + + # Ensure there are no errors in the serve script execution + assert "Error" not in result_serve.stderr + + +def script_with_args( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k +): + current_path = os.path.dirname(os.path.abspath(__file__)) + + os.path.join(current_path, "../../.github/workflows/config/" + model_name + "-ci.yaml") + + example_query_single_path = os.path.join( + current_path, "../../examples/inference/api_server_simple/query_single.py" + ) + + cmd_single = [ + "python", + example_query_single_path, + "--model_endpoint", + base_url + model_name, + ] + + if streaming_response: + cmd_single.append("--streaming_response") + + if max_new_tokens is not None: + cmd_single.extend(["--max_new_tokens", str(max_new_tokens)]) + + if temperature is not None: + cmd_single.extend(["--temperature", str(temperature)]) + + if top_p is not None: + cmd_single.extend(["--top_p", str(top_p)]) + + if top_k is not None: + cmd_single.extend(["--top_k", str(top_k)]) + + result_query_single = subprocess.run(cmd_single, capture_output=True, text=True) + + # Print the output of subprocess.run for checking if output is expected + print(result_query_single) + + # Ensure there are no errors 
in the OpenAI API query script execution + assert "Error" not in result_query_single.stderr + + # Returncode should be 0 when there is no exception + assert result_query_single.returncode == 0 + + +executed_models = {} + + +# Parametrize the test function with different combinations of parameters +# TODO: more models and combinations will be added and tested. +@pytest.mark.parametrize( + "base_url,model_name,streaming_response,max_new_tokens,temperature,top_p, top_k", + [ + (base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k) + for base_url in ["http://localhost:8000/"] + for model_name in ["gpt2"] + for streaming_response in [None] + for max_new_tokens in [None] + for temperature in [None] + for top_p in [None] + for top_k in [None] + ], +) +def test_script( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k +): + global executed_models + + # Check if this modelname has already executed start_serve + if model_name not in executed_models: + start_serve(model_name) + # Mark this modelname has already executed start_serve + executed_models[model_name] = True + + script_with_args( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k + ) From 9498efe9dae2d6547d15d1fd551eda8f2ead0cd0 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:50:32 +0000 Subject: [PATCH 14/24] format Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 20 +- llm_on_ray/finetune/finetune.py | 2 +- tests/finetune/test_chat_template.py | 180 +++++++++--------- 3 files changed, 104 insertions(+), 98 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 636ec006c..1dc953d27 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -116,15 +116,19 @@ def tokenize_function(self, examples, tokenizer): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(new_message, add_special_tokens=False, max_length=self.config.get("max_length")) + return tokenizer( + new_message, add_special_tokens=False, max_length=self.config.get("max_length") + ) else: new_messages = [ { "role": "user", "content": "###Instruction:\n" - + examples["instruction"] + "\n\n" - + "###context:\n" - + examples["context"] + "\n\n", + + examples["instruction"] + + "\n\n" + + "###context:\n" + + examples["context"] + + "\n\n", }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] @@ -145,9 +149,9 @@ def tokenize_function(self, examples, tokenizer): new_messages, tokenize=False, ) - tokenizer = tokenizer(new_tokenizer, - add_special_tokens=False, - max_length=self.config.get("max_length")) + tokenizer = tokenizer( + new_tokenizer, add_special_tokens=False, max_length=self.config.get("max_length") + ) return tokenizer def prepare(self, tokenizer, dataset): @@ -183,7 +187,7 @@ def group_texts(examples): total_length = (total_length // block_size) * block_size # Split by chunks of max_len. 
result = { - k: [t[i: i + block_size] for i in range(0, total_length, block_size)] + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 85a678553..ae2e36c87 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -362,7 +362,7 @@ def main(external_config=None): if "xpu" in ipex.__version__: num_cpus = ( - resources_per_worker["CPU"] * num_training_workers + 1 + resources_per_worker["CPU"] * num_training_workers + 1 ) # additional 1 for head worker ray.init(num_cpus=num_cpus, runtime_env=runtime_env) else: diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index 7cdda115c..2270a5781 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -7,133 +7,135 @@ class TestTokenizeFunction(unittest.TestCase): def setUp(self): - self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf') + self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") self.config = { - 'gpt_base_model': True, - 'max_length': 512, - 'trust_remote_code': False, - 'chat_template': "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" - "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" - "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " - "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " - "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " - "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " - "End \n'}}", + "gpt_base_model": True, + "max_length": 512, + "trust_remote_code": False, + "chat_template": "Below is an instruction that describes a task. Write a response that appropriately " + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " + "End \n'}}", } self.processer = GeneralProcesser(self.config) def test_tokenize_function_with_gpt_model(self): - self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') + self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = 'Below is an instruction that describes a task. 
Write a response that '\ - 'appropriately completes the request.\n'\ - '\n'\ - '### Instruction:\n'\ - 'Test instruction\n'\ - '\n'\ - 'Input:\n'\ - 'Test context\n'\ - '\n'\ - '### Response:\n'\ - 'Test response\n'\ - '\n'\ - '### End' + expected_result = ( + "Below is an instruction that describes a task. Write a response that " + "appropriately completes the request.\n" + "\n" + "### Instruction:\n" + "Test instruction\n" + "\n" + "Input:\n" + "Test context\n" + "\n" + "### Response:\n" + "Test response\n" + "\n" + "### End" + ) result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_custom_chat_template(self): - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = '<|im_start|>user\n' \ - '###Instruction:\n' \ - 'Test instruction\n' \ - '\n' \ - '###context:\n' \ - 'Test context\n' \ - '\n' \ - '<|im_end|><|im_start|>assistant\n' \ - 'Test response\n' \ - '\n' \ - '<|im_end|>' + expected_result = ( + "<|im_start|>user\n" + "###Instruction:\n" + "Test instruction\n" + "\n" + "###context:\n" + "Test context\n" + "\n" + "<|im_end|><|im_start|>assistant\n" + "Test response\n" + "\n" + "<|im_end|>" + ) # Set custom chat template - self.config['custom_chat_template'] = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"\ - "+ message['content'] + '<|im_end|>'}}{% endfor %}" + self.config["custom_chat_template"] = ( + "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'" + "+ message['content'] + '<|im_end|>'}}{% endfor %}" + ) - self.config['gpt_base_model'] = False + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_chat_template(self): - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = 'Below is an instruction that describes a task. Write a response that '\ - 'appropriately completes the request\n'\ - '### Instruction: ###Instruction:\n'\ - 'Test instruction\n'\ - '\n'\ - '###context:\n'\ - 'Test context\n'\ - '\n'\ - '### Response: Test response\n'\ - '\n'\ - '### End \n'\ - - self.config['gpt_base_model'] = False + expected_result = ( + "Below is an instruction that describes a task. 
Write a response that " + "appropriately completes the request\n" + "### Instruction: ###Instruction:\n" + "Test instruction\n" + "\n" + "###context:\n" + "Test context\n" + "\n" + "### Response: Test response\n" + "\n" + "### End \n" + ) + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_default_chat_template(self): - self.tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it') - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } chat_example = [ { "role": "user", "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", - }, { "role": "assistant", "content": "Test response\n\n", - } + }, ] # Verify the format of the result - expected_result = self.tokenizer.apply_chat_template(chat_example, - tokenize=False, - max_length=self.config.get("max_length")) + expected_result = self.tokenizer.apply_chat_template( + chat_example, tokenize=False, max_length=self.config.get("max_length") + ) - self.config['gpt_base_model'] = False + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 115c513fb9b9648b374d102d6e5b23479da0be71 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:22:03 +0000 Subject: [PATCH 15/24] [Finetune] use base model mpt-7b instead of mpt-7b-chat (#181) * use base model mpt-7b instead of mpt-7b-chat Signed-off-by: minmingzhu * manual setting specify tokenizer Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update doc/finetune_parameters.md Signed-off-by: minmingzhu --------- Signed-off-by: minmingzhu --- llm_on_ray/finetune/models/mpt-7b.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_on_ray/finetune/models/mpt-7b.yaml b/llm_on_ray/finetune/models/mpt-7b.yaml index 8e719f186..ef2efa006 100644 --- a/llm_on_ray/finetune/models/mpt-7b.yaml +++ b/llm_on_ray/finetune/models/mpt-7b.yaml @@ -1,7 +1,6 @@ General: base_model: mosaicml/mpt-7b tokenizer_name: EleutherAI/gpt-neox-20b - is_base_model: false gpt_base_model: false output_dir: /tmp/llm-ray/output save_strategy: no From cfa3064847b65ee88d302833aa23e5de8f8defa2 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 06:27:14 +0000 Subject: [PATCH 16/24] fix license issues Signed-off-by: minmingzhu --- tests/finetune/test_chat_template.py | 15 +++++++++++++++ tests/inference/test_query_single.py | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index 2270a5781..a416d8f7b 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -1,3 +1,18 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import unittest import transformers diff --git a/tests/inference/test_query_single.py b/tests/inference/test_query_single.py index 1c32f6b73..d48727a30 100644 --- a/tests/inference/test_query_single.py +++ b/tests/inference/test_query_single.py @@ -1,3 +1,19 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import subprocess import pytest import os From c0e4d2d127c6e5df1716cebe338c5b249f55b783 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 22 Apr 2024 14:18:31 +0800 Subject: [PATCH 17/24] Update finetune.yaml --- llm_on_ray/finetune/finetune.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 15b38501f..627a88753 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -12,7 +12,6 @@ General: lora_alpha: 32 lora_dropout: 0.1 enable_gradient_checkpointing: false - custom_chat_template: null Dataset: train_file: examples/data/sample_finetune_data_small.jsonl group: true From b24c9f0188363c01650bda61d8fa0567da5bbd82 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 14 May 2024 14:08:15 +0800 Subject: [PATCH 18/24] refactor datap rocesser Signed-off-by: minmingzhu --- llm_on_ray/common/dataprocesser/__init__.py | 3 +- .../common/dataprocesser/general_processer.py | 126 +++++++++++++---- llm_on_ray/common/trainer/default_trainer.py | 8 +- tests/finetune/test_chat_template.py | 52 +++---- tests/finetune/test_slimOrca_chat_template.py | 128 ++++++++++++++++++ 5 files changed, 263 insertions(+), 54 deletions(-) create mode 100644 tests/finetune/test_slimOrca_chat_template.py diff --git a/llm_on_ray/common/dataprocesser/__init__.py b/llm_on_ray/common/dataprocesser/__init__.py index 2b5152764..c1bf68ae8 100644 --- a/llm_on_ray/common/dataprocesser/__init__.py +++ b/llm_on_ray/common/dataprocesser/__init__.py @@ -15,7 +15,8 @@ # from llm_on_ray.common.dataprocesser.dataprocesser import DataProcesser -from llm_on_ray.common.dataprocesser.general_processer import GeneralProcesser +from llm_on_ray.common.dataprocesser.general_processer import ChatDataPreprocess +from llm_on_ray.common.dataprocesser.general_processer import SlimOrcaDataPreprocess from llm_on_ray.common.dataprocesser.rm_dataprocesser import RMDataProcesser diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 
1dc953d27..31094aa8b 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -24,9 +24,9 @@ from llm_on_ray.common.dataprocesser import DataProcesser INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." -INSTRUCTION_KEY = "### Instruction:" -INPUT_KEY = "Input:" -RESPONSE_KEY = "### Response:" +INSTRUCTION_KEY = "### Instruction: " +INPUT_KEY = "Input: " +RESPONSE_KEY = "### Response: " END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" @@ -70,6 +70,11 @@ ) TEXT_COLUMN_NAME = "text" +SLIMORCA_PROMPT_DICT = { + "prompt_with_input": ("### System: {system} \n" "### User: {user} \n### Assistant: {gpt}"), + "prompt_without_input": ("### System: {system} \n" "### Assistant: {gpt}"), +} + class DataCollatorForCompletionOnlyLM(transformers.DataCollatorForLanguageModeling): def torch_call(self, examples): @@ -98,8 +103,17 @@ def torch_call(self, examples): return batch -class GeneralProcesser(DataProcesser): - def tokenize_function(self, examples, tokenizer): +class ChatDataPreprocess(DataProcesser): + base_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n""" + + def __init__(self, config): + super().__init__(config) + self.prompt_template = self.base_template + self.user = "### Instruction:\n" + self.assistant = "### Response:\n" + self.end = "### End\n" + + def create_data(self, examples): if self.config.get("gpt_base_model"): instruction = examples["instruction"] response = examples["response"] @@ -109,50 +123,49 @@ def tokenize_function(self, examples, tokenizer): if not response: raise ValueError(f"Expected a response in: {examples}") if context: - new_message = PROMPT_WITH_INPUT_FORMAT.format( + new_messages = PROMPT_WITH_INPUT_FORMAT.format( instruction=instruction, response=response, input=context ) else: - new_message = PROMPT_NO_INPUT_FORMAT.format( + new_messages = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer( - new_message, add_special_tokens=False, max_length=self.config.get("max_length") - ) else: new_messages = [ { "role": "user", - "content": "###Instruction:\n" - + examples["instruction"] + "content": examples["instruction"] + "\n\n" - + "###context:\n" + + INPUT_KEY + examples["context"] + "\n\n", }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] + + return new_messages + + def tokenize_func(self, tokenizer, message): + if self.config.get("gpt_base_model"): + return tokenizer( + message, add_special_tokens=False, max_length=self.config.get("max_length") + ) + else: if self.config.get("chat_template") is not None: tokenizer.chat_template = self.config.get("chat_template") - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - ) elif tokenizer.chat_template is not None: - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - ) + pass else: tokenizer.chat_template = self.config.get("default_chat_template") - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - ) - tokenizer = tokenizer( + + new_tokenizer = tokenizer.apply_chat_template( + message, + tokenize=False, + ) + print(new_tokenizer) + return tokenizer( new_tokenizer, add_special_tokens=False, max_length=self.config.get("max_length") ) - return tokenizer def prepare(self, tokenizer, dataset): per_device_train_batch_size = 
self.config.get("per_device_train_batch_size") @@ -169,7 +182,7 @@ def prepare(self, tokenizer, dataset): column_names = dataset["train"].column_names tokenized_datasets = dataset.map( - lambda examples: self.tokenize_function(examples, tokenizer), + lambda examples: self.tokenize_func(tokenizer, self.create_data(examples)), remove_columns=column_names, load_from_cache_file=False, desc="Tokenize dataset", @@ -235,3 +248,60 @@ def prepare_dataloader(self, tokenizer, dataset): } eval_dataloader = torch.utils.data.DataLoader(eval_dataset, **eval_dataloader_params) return train_dataloader, eval_dataloader + + +class SlimOrcaDataPreprocess(ChatDataPreprocess): + chat_template = ( + "{% for message in messages %}" + "{% if message['role'] == 'system' %}" + "{{ '### System: ' + message['content'] }}" + "{% elif message['role'] == 'user' %}" + "{{ '### User: ' + message['content'] }}" + "{% elif message['role'] == 'assistant' %}" + "{{ '### Assistant: ' + message['content'] }}" + "{% endif %}" + "{% endfor %}" + ) + + def __init__(self, config): + super().__init__(config) + self.config["chat_template"] = self.chat_template + self.default_system = "You are a helpful, respectful and honest assistant." + + def create_data(self, data): + examples = {} + conv = data["conversations"] + # system + if conv[0]["from"] != "system": + examples["system"] = self.default_system + start = 0 + elif conv[0]["from"] == "system" and conv[0]["value"] == "": + examples[conv[0]["from"]] = self.default_system + start = 1 + else: + examples[conv[0]["from"]] = conv[0]["value"] + start = 1 + + for j in range(start, len(conv) - 1, 2): + examples[conv[j]["from"]] = conv[j]["value"] + examples[conv[j + 1]["from"]] = conv[j + 1]["value"] + + new_messages = [ + {"role": "system", "content": examples["system"] + "\n"}, + { + "role": "user", + "content": examples["human"] + "\n", + }, + {"role": "assistant", "content": examples["gpt"] + "\n"}, + ] + if self.config.get("gpt_base_model"): + if examples["human"]: + return SLIMORCA_PROMPT_DICT["prompt_with_input"].format( + system=examples["system"], user=examples["human"], gpt=examples["gpt"] + ) + else: + return SLIMORCA_PROMPT_DICT["prompt_with_input"].format( + system=examples["human"], gpt=examples["gpt"] + ) + else: + return new_messages diff --git a/llm_on_ray/common/trainer/default_trainer.py b/llm_on_ray/common/trainer/default_trainer.py index 5509bc3a1..61d9d6015 100644 --- a/llm_on_ray/common/trainer/default_trainer.py +++ b/llm_on_ray/common/trainer/default_trainer.py @@ -37,7 +37,13 @@ def __init__(self, config): self.config = config dataprocesser_config = config.get("dataprocesser") dataprocesser_type = dataprocesser_config.get("type") - Factory = dataprocesser.DataProcesser.registory.get(dataprocesser_type) + if dataprocesser_type == "chat": + Factory = dataprocesser.DataProcesser.registory.get("ChatDataPreprocess") + elif dataprocesser_type == "SlimOrca": + Factory = dataprocesser.DataProcesser.registory.get("SlimOrcaDataPreprocess") + else: + raise ValueError(f"there is no {dataprocesser_type} dataprocesser.") + if Factory is None: raise ValueError(f"there is no {dataprocesser_type} dataprocesser.") self.dataprocesser = Factory(dataprocesser_config) diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index a416d8f7b..4d1217b6c 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -17,7 +17,7 @@ import transformers from transformers import AutoTokenizer -from 
llm_on_ray.common.dataprocesser.general_processer import GeneralProcesser +from llm_on_ray.common.dataprocesser.general_processer import ChatDataPreprocess class TestTokenizeFunction(unittest.TestCase): @@ -36,7 +36,7 @@ def setUp(self): "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " "End \n'}}", } - self.processer = GeneralProcesser(self.config) + self.processer = ChatDataPreprocess(self.config) def test_tokenize_function_with_gpt_model(self): self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") @@ -52,20 +52,23 @@ def test_tokenize_function_with_gpt_model(self): "Below is an instruction that describes a task. Write a response that " "appropriately completes the request.\n" "\n" - "### Instruction:\n" + "### Instruction: \n" "Test instruction\n" "\n" - "Input:\n" + "Input: \n" "Test context\n" "\n" - "### Response:\n" + "### Response: \n" "Test response\n" "\n" "### End" ) - result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) + print(self.processer.create_data(examples)) + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(examples)) + print(self.tokenizer.decode(result["input_ids"])) + + self.assertEqual(expected_result, self.tokenizer.decode(result["input_ids"])) def test_tokenize_function_with_custom_chat_template(self): examples = { @@ -77,28 +80,30 @@ def test_tokenize_function_with_custom_chat_template(self): # Verify the format of the result expected_result = ( "<|im_start|>user\n" - "###Instruction:\n" "Test instruction\n" "\n" - "###context:\n" - "Test context\n" + "Input: Test context\n" "\n" "<|im_end|><|im_start|>assistant\n" "Test response\n" "\n" "<|im_end|>" ) + + print(expected_result) # Set custom chat template - self.config["custom_chat_template"] = ( + self.config["chat_template"] = ( "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'" "+ message['content'] + '<|im_end|>'}}{% endfor %}" ) self.config["gpt_base_model"] = False - result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) + print(self.processer.create_data(examples)) + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(examples)) + print(self.tokenizer.decode(result["input_ids"])) + self.assertEqual(expected_result, self.tokenizer.decode(result["input_ids"])) - def test_tokenize_function_with_chat_template(self): + def test_tokenize_function_with_default_chat_template(self): examples = { "instruction": "Test instruction", "response": "Test response", @@ -109,21 +114,19 @@ def test_tokenize_function_with_chat_template(self): expected_result = ( "Below is an instruction that describes a task. 
Write a response that " "appropriately completes the request\n" - "### Instruction: ###Instruction:\n" - "Test instruction\n" + "### Instruction: Test instruction\n" "\n" - "###context:\n" - "Test context\n" + "Input: Test context\n" "\n" "### Response: Test response\n" "\n" "### End \n" ) self.config["gpt_base_model"] = False - result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(examples)) + self.assertEqual(expected_result, self.tokenizer.decode(result["input_ids"])) - def test_tokenize_function_with_default_chat_template(self): + def test_tokenize_function_with_tokenizer_chat_template(self): self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") examples = { "instruction": "Test instruction", @@ -134,7 +137,7 @@ def test_tokenize_function_with_default_chat_template(self): chat_example = [ { "role": "user", - "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", + "content": "Test instruction\n\nInput: Test context\n\n", }, { "role": "assistant", @@ -147,9 +150,10 @@ def test_tokenize_function_with_default_chat_template(self): chat_example, tokenize=False, max_length=self.config.get("max_length") ) + self.config["chat_template"] = None self.config["gpt_base_model"] = False - result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(examples)) + self.assertEqual(expected_result, self.tokenizer.decode(result["input_ids"])) if __name__ == "__main__": diff --git a/tests/finetune/test_slimOrca_chat_template.py b/tests/finetune/test_slimOrca_chat_template.py new file mode 100644 index 000000000..059a316d1 --- /dev/null +++ b/tests/finetune/test_slimOrca_chat_template.py @@ -0,0 +1,128 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +import transformers +from datasets import Dataset +from transformers import AutoTokenizer +from llm_on_ray.common.dataprocesser.general_processer import ( + ChatDataPreprocess, + SlimOrcaDataPreprocess, +) + + +class TestTokenizeFunction(unittest.TestCase): + def setUp(self): + self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + self.config = { + "gpt_base_model": True, + "max_length": 512, + "trust_remote_code": False, + "chat_template": "Below is an instruction that describes a task. 
Write a response that appropriately " + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " + "End \n'}}", + } + self.processer = SlimOrcaDataPreprocess(self.config) + examples = { + "conversations": [ + {"from": "system", "value": "Test system", "weight": None}, + {"from": "human", "value": "Test human", "weight": 0}, + {"from": "gpt", "value": "Test gpt.", "weight": 1}, + ] + } + + self.ds = Dataset.from_dict(examples) + + def test_tokenize_function_with_gpt_model(self): + self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") + + # Verify the format of the result + expected_result = ( + "### System: Test system \n" "### User: Test human \n" "### Assistant: Test gpt." + ) + + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(self.ds)) + + self.assertEqual(expected_result, self.tokenizer.decode(result["input_ids"])) + + def test_tokenize_function_with_custom_chat_template(self): + # Verify the format of the result + expected_result = ( + "<|im_start|>system\n" + "Test system\n" + "<|im_end|><|im_start|>user\n" + "Test human\n" + "<|im_end|><|im_start|>assistant\n" + "Test gpt.\n" + "<|im_end|>" + ) + + print(expected_result) + # Set custom chat template + self.config["chat_template"] = ( + "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'" + "+ message['content'] + '<|im_end|>'}}{% endfor %}" + ) + + self.config["gpt_base_model"] = False + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(self.ds)) + self.assertEqual(expected_result, self.tokenizer.decode(result["input_ids"])) + + def test_tokenize_function_with_default_chat_template(self): + # Verify the format of the result + expected_result = ( + "### System: Test system\n" "### User: Test human\n" "### Assistant: Test gpt.\n" + ) + self.config["gpt_base_model"] = False + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(self.ds)) + self.assertEqual(expected_result, self.tokenizer.decode(result["input_ids"])) + + def test_tokenize_function_with_tokenizer_chat_template(self): + self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + + chat_example = [ + { + "role": "system", + "content": "Test system\n", + }, + { + "role": "user", + "content": "Test human\n", + }, + { + "role": "assistant", + "content": "Test gpt.\n", + }, + ] + + # Verify the format of the result + expected_result = self.tokenizer.apply_chat_template( + chat_example, tokenize=True, max_length=self.config.get("max_length") + ) + + self.config["chat_template"] = None + self.config["gpt_base_model"] = False + result = self.processer.tokenize_func(self.tokenizer, self.processer.create_data(self.ds)) + self.assertEqual(expected_result, result["input_ids"]) + + +if __name__ == "__main__": + unittest.main() From f0d94d11f7835d3ffd31b536a0f94aeafe792244 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 14 May 2024 14:24:35 +0800 Subject: [PATCH 19/24] update --- llm_on_ray/finetune/finetune.yaml | 3 ++- llm_on_ray/finetune/finetune_config.py 
| 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 627a88753..78a9e1c57 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -13,7 +13,8 @@ General: lora_dropout: 0.1 enable_gradient_checkpointing: false Dataset: - train_file: examples/data/sample_finetune_data_small.jsonl + type: "SlimOrca" + train_file: Open-Orca/SlimOrca group: true max_length: 512 block_size: 512 diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index bbbb916af..3046d96c3 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -80,6 +80,7 @@ class General(BaseModel): class Dataset(BaseModel): + type: str = "chat" train_file: str validation_file: Optional[str] validation_split_percentage: int From 6075c2c97dda09c38031e851797445bfd1c69763 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 14 May 2024 22:16:41 +0800 Subject: [PATCH 20/24] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 51 +++++++++---------- tests/finetune/test_chat_template.py | 28 +++++++--- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 31094aa8b..3cb32e778 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -132,15 +132,19 @@ def create_data(self, examples): ) else: new_messages = [ + { + "role": "system", + "content": INTRO_BLURB + "\n", + }, { "role": "user", "content": examples["instruction"] - + "\n\n" + + "\n" + INPUT_KEY + examples["context"] - + "\n\n", + + "\n", }, - {"role": "assistant", "content": examples["response"] + "\n\n"}, + {"role": "assistant", "content": examples["response"] + "\n"}, ] return new_messages @@ -162,7 +166,6 @@ def tokenize_func(self, tokenizer, message): message, tokenize=False, ) - print(new_tokenizer) return tokenizer( new_tokenizer, add_special_tokens=False, max_length=self.config.get("max_length") ) @@ -251,21 +254,9 @@ def prepare_dataloader(self, tokenizer, dataset): class SlimOrcaDataPreprocess(ChatDataPreprocess): - chat_template = ( - "{% for message in messages %}" - "{% if message['role'] == 'system' %}" - "{{ '### System: ' + message['content'] }}" - "{% elif message['role'] == 'user' %}" - "{{ '### User: ' + message['content'] }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '### Assistant: ' + message['content'] }}" - "{% endif %}" - "{% endfor %}" - ) def __init__(self, config): super().__init__(config) - self.config["chat_template"] = self.chat_template self.default_system = "You are a helpful, respectful and honest assistant." 
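    # Note: a SlimOrca record (as constructed in tests/finetune/test_slimOrca_chat_template.py)
    # is assumed to look like
    #   {"conversations": [{"from": "system", "value": "..."},
    #                      {"from": "human",  "value": "..."},
    #                      {"from": "gpt",    "value": "..."}]}
    # create_data() below flattens it into a prompt string (gpt_base_model)
    # or into a list of {"role", "content"} chat messages.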
def create_data(self, data): @@ -286,22 +277,26 @@ def create_data(self, data): examples[conv[j]["from"]] = conv[j]["value"] examples[conv[j + 1]["from"]] = conv[j + 1]["value"] - new_messages = [ - {"role": "system", "content": examples["system"] + "\n"}, - { - "role": "user", - "content": examples["human"] + "\n", - }, - {"role": "assistant", "content": examples["gpt"] + "\n"}, - ] if self.config.get("gpt_base_model"): if examples["human"]: - return SLIMORCA_PROMPT_DICT["prompt_with_input"].format( - system=examples["system"], user=examples["human"], gpt=examples["gpt"] + return PROMPT_WITH_INPUT_FORMAT.format( + instruction=examples["system"], response=examples["gpt"], input=examples["human"] ) else: - return SLIMORCA_PROMPT_DICT["prompt_with_input"].format( - system=examples["human"], gpt=examples["gpt"] + return PROMPT_NO_INPUT_FORMAT.format( + instruction=examples["system"], response=examples["gpt"] ) else: + new_messages = [ + {"role": "system", "content": INTRO_BLURB + "\n"}, + { + "role": "user", + "content": examples["system"] + + "\n" + + INPUT_KEY + + examples["human"] + + "\n", + }, + {"role": "assistant", "content": examples["gpt"] + "\n"}, + ] return new_messages diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index 4d1217b6c..31d0eed12 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -27,14 +27,26 @@ def setUp(self): "gpt_base_model": True, "max_length": 512, "trust_remote_code": False, - "chat_template": "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" - "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" - "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " - "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " - "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " - "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " - "End \n'}}", + "chat_template": "{% if messages[0]['role'] == 'system' %}" + "{% set loop_messages = messages[1:] %}" + "{% set system_message = messages[0]['content'] %}" + "{% else %}" + "{% set loop_messages = messages %}" + "{% set system_message = false %}" + "{% endif %}" + "{% for message in loop_messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if loop.index0 == 0 and system_message %}" + "{{ system_message }}" + "{% endif %}" + "{% if message['role'] == 'user' %}" + "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{% elif message['role'] == 'assistant' %}" + "{{ '### Response:' + message['content'] + eos_token }}" + "{% endif %}{% endfor %}" + "{{'### End \n'}}", } self.processer = ChatDataPreprocess(self.config) From c17ce45af11f5dfd7d1c53ce578c4ab74f577106 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 17 May 2024 14:22:24 +0800 Subject: [PATCH 21/24] update Signed-off-by: minmingzhu --- llm_on_ray/common/__init__.py | 11 +++- .../common/dataprocesser/general_processer.py | 59 ++++++++++++------- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/llm_on_ray/common/__init__.py b/llm_on_ray/common/__init__.py index 0e8e821ad..e002976b6 100644 --- a/llm_on_ray/common/__init__.py 
+++ b/llm_on_ray/common/__init__.py @@ -18,4 +18,13 @@ from llm_on_ray.common.torch_config import TorchConfig from llm_on_ray.common.config import Config from llm_on_ray.common.init import init -from llm_on_ray.common import agentenv, dataset, initializer, model, optimizer, tokenizer, trainer +from llm_on_ray.common import ( + agentenv, + dataset, + initializer, + model, + optimizer, + tokenizer, + trainer, + dataprocesser, +) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 3cb32e778..6f8a9e1e2 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -27,7 +27,6 @@ INSTRUCTION_KEY = "### Instruction: " INPUT_KEY = "Input: " RESPONSE_KEY = "### Response: " -END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" PROMPT_NO_INPUT_FORMAT = """{intro} @@ -36,15 +35,12 @@ {instruction} {response_key} -{response} - -{end_key}""".format( +{response}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY, response="{response}", - end_key=END_KEY, ) PROMPT_WITH_INPUT_FORMAT = """{intro} @@ -56,9 +52,7 @@ {input} {response_key} -{response} - -{end_key}""".format( +{response}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", @@ -66,7 +60,6 @@ input="{input}", response_key=RESPONSE_KEY, response="{response}", - end_key=END_KEY, ) TEXT_COLUMN_NAME = "text" @@ -170,10 +163,7 @@ def tokenize_func(self, tokenizer, message): new_tokenizer, add_special_tokens=False, max_length=self.config.get("max_length") ) - def prepare(self, tokenizer, dataset): - per_device_train_batch_size = self.config.get("per_device_train_batch_size") - per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") - + def tokenize_dataset(self, tokenizer, dataset): group = self.config.get("group") block_size = self.config.get("block_size") tokenizer.pad_token = tokenizer.eos_token @@ -254,7 +244,6 @@ def prepare_dataloader(self, tokenizer, dataset): class SlimOrcaDataPreprocess(ChatDataPreprocess): - def __init__(self, config): super().__init__(config) self.default_system = "You are a helpful, respectful and honest assistant." @@ -280,7 +269,9 @@ def create_data(self, data): if self.config.get("gpt_base_model"): if examples["human"]: return PROMPT_WITH_INPUT_FORMAT.format( - instruction=examples["system"], response=examples["gpt"], input=examples["human"] + instruction=examples["system"], + response=examples["gpt"], + input=examples["human"], ) else: return PROMPT_NO_INPUT_FORMAT.format( @@ -291,12 +282,40 @@ def create_data(self, data): {"role": "system", "content": INTRO_BLURB + "\n"}, { "role": "user", - "content": examples["system"] - + "\n" - + INPUT_KEY - + examples["human"] - + "\n", + "content": examples["system"] + "\n" + INPUT_KEY + examples["human"] + "\n", }, {"role": "assistant", "content": examples["gpt"] + "\n"}, ] return new_messages + + +class OpenOrcaDataPreprocess(ChatDataPreprocess): + def __init__(self, config): + super().__init__(config) + self.default_system = "You are an AI assistant. You will be given a task. You must generate a detailed and long answer." 
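+    # Note: Open-Orca style records are assumed to carry "system", "question"
+    # and "chosen" fields; create_data() below maps them to the same prompt
+    # string / chat-message structure used by the SlimOrca preprocessor.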
+ + def create_data(self, examples): + if self.config.get("gpt_base_model"): + if not examples["system"]: + examples["system"] = self.default_system + + if examples["question"]: + return PROMPT_WITH_INPUT_FORMAT.format( + instruction=examples["system"], + response=examples["chosen"], + input=examples["question"], + ) + else: + return PROMPT_NO_INPUT_FORMAT.format( + instruction=examples["system"], response=examples["chosen"] + ) + else: + new_messages = [ + {"role": "system", "content": INTRO_BLURB + "\n"}, + { + "role": "user", + "content": examples["system"] + "\n" + INPUT_KEY + examples["question"] + "\n", + }, + {"role": "assistant", "content": examples["chosen"] + "\n"}, + ] + return new_messages From 678d6e265c381c748a7a7db4a644df9e1cfcfbeb Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 17 May 2024 15:41:37 +0800 Subject: [PATCH 22/24] update --- llm_on_ray/finetune/finetune.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index ae2e36c87..a4ecd9b07 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -221,7 +221,17 @@ def train_func(config: Dict[str, Any]): } ) - dataprocesser = common.dataprocesser.DataProcesser.registory.get("GeneralProcesser")( + dataprocesser_type = config["Dataset"]["type"] + if dataprocesser_type == "chat": + preprocesser_name = "ChatDataPreprocess" + elif dataprocesser_type == "OpenOrca": + preprocesser_name = "OpenOrcaDataPreprocess" + elif dataprocesser_type == "SlimOrca": + preprocesser_name = "SlimOrcaDataPreprocess" + else: + raise ValueError(f"there is no {dataprocesser_type} dataprocesser.") + + dataprocesser = common.dataprocesser.DataProcesser.registory.get(preprocesser_name)( config={ "per_device_train_batch_size": config["Training"]["batch_size"], "per_device_eval_batch_size": config["Training"]["batch_size"], @@ -232,6 +242,9 @@ def train_func(config: Dict[str, Any]): "shuffle": config["Dataset"].get("shuffle", False), "name": tokenizer_name, "config": config["General"]["config"], + "gpt_base_model": config["General"].get("gpt_base_model", False), + "chat_template": config["General"]["chat_template"], + "default_chat_template": config["General"]["default_chat_template"], } ) tokenized_datasets = dataprocesser.tokenize_dataset(tokenizer, datasets) From 294161db8cec8324977856d1a362e68575791755 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 17 May 2024 15:45:51 +0800 Subject: [PATCH 23/24] update --- llm_on_ray/finetune/finetune_config.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 3046d96c3..0a25ad777 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -62,20 +62,29 @@ class General(BaseModel): enable_gradient_checkpointing: bool = False chat_template: Optional[str] = None default_chat_template: str = ( - "{{ bos_token }}" "{% if messages[0]['role'] == 'system' %}" - "{{ raise_exception('System role not supported') }}" + "{% set loop_messages = messages[1:] %}" + "{% set system_message = messages[0]['content'] %}" + "{% else %}" + "{% set loop_messages = messages %}" + "{% set system_message = false %}" "{% endif %}" - "{% for message in messages %}" + "{% for message in loop_messages %}" "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" "{{ raise_exception('Conversation roles must alternate 
user/assistant/user/assistant/...') }}"
         "{% endif %}"
-        "{% if message['role'] == 'user' %}"
-        "{{ '### Instruction: ' + message['content'] + eos_token }}"
+        "{% if loop.index0 == 0 and system_message %}"
+        "{{ system_message }}"
+        "{% endif %}"
+        "{% if message['role'] == 'user' %}"
+        "{{ '### Instruction: ' + message['content'].strip() }}"
         "{% elif message['role'] == 'assistant' %}"
-        "{{ '### Response:' + message['content'] + eos_token }}"
-        "{% endif %}{% endfor %}"
-        "{{'### End \n'}}"
+        "{{ '### Response:' + message['content'].strip() }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}"
+        "{{ '### Response: '}}"
+        "{% endif %}"
     )


From c104a3e3ec4743d7169577a724872140d0c5dfdd Mon Sep 17 00:00:00 2001
From: minmingzhu
Date: Mon, 20 May 2024 16:30:45 +0800
Subject: [PATCH 24/24] update

---
 .../common/dataprocesser/general_processer.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py
index 6f8a9e1e2..c51329718 100644
--- a/llm_on_ray/common/dataprocesser/general_processer.py
+++ b/llm_on_ray/common/dataprocesser/general_processer.py
@@ -244,8 +244,21 @@ def prepare_dataloader(self, tokenizer, dataset):
 
 
 class SlimOrcaDataPreprocess(ChatDataPreprocess):
+    chat_template = (
+        "{% for message in messages %}"
+        "{% if message['role'] == 'system' %}"
+        "{{ '### System: ' + message['content'] }}"
+        "{% elif message['role'] == 'user' %}"
+        "{{ '### User: ' + message['content'] }}"
+        "{% elif message['role'] == 'assistant' %}"
+        "{{ '### Assistant: ' + message['content'] }}"
+        "{% endif %}"
+        "{% endfor %}"
+    )
+
     def __init__(self, config):
         super().__init__(config)
+        self.config["chat_template"] = self.chat_template
         self.default_system = "You are a helpful, respectful and honest assistant."
 
     def create_data(self, data):
@@ -268,18 +281,18 @@ def create_data(self, data):
 
         if self.config.get("gpt_base_model"):
             if examples["human"]:
-                return PROMPT_WITH_INPUT_FORMAT.format(
+                return SLIMORCA_PROMPT_DICT["prompt_with_input"].format(
                     instruction=examples["system"],
                     response=examples["gpt"],
                     input=examples["human"],
                 )
             else:
-                return PROMPT_NO_INPUT_FORMAT.format(
+                return SLIMORCA_PROMPT_DICT["prompt_without_input"].format(
                     instruction=examples["system"], response=examples["gpt"]
                 )
         else:
             new_messages = [
-                {"role": "system", "content": INTRO_BLURB + "\n"},
+                {"role": "system", "content": examples["system"] + "\n"},
                 {
                     "role": "user",
                     "content": examples["system"] + "\n" + INPUT_KEY + examples["human"] + "\n",
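
For reference, below is a minimal standalone sketch of how the SlimOrca chat template restored in PATCH 24 is applied at preprocessing time. The gpt-neox-20b checkpoint and the sample messages are illustrative assumptions only, not part of the patches; any Hugging Face tokenizer that supports apply_chat_template behaves the same way.

from transformers import AutoTokenizer

# Same Jinja template as SlimOrcaDataPreprocess.chat_template in PATCH 24.
SLIMORCA_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'system' %}"
    "{{ '### System: ' + message['content'] }}"
    "{% elif message['role'] == 'user' %}"
    "{{ '### User: ' + message['content'] }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ '### Assistant: ' + message['content'] }}"
    "{% endif %}"
    "{% endfor %}"
)

# Illustrative checkpoint; swap in the tokenizer configured for the finetune job.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
tokenizer.chat_template = SLIMORCA_CHAT_TEMPLATE

messages = [
    {"role": "system", "content": "You are a helpful, respectful and honest assistant.\n"},
    {"role": "user", "content": "Test human\n"},
    {"role": "assistant", "content": "Test gpt.\n"},
]

# Render the conversation to a flat prompt string
# ("### System: ...### User: ...### Assistant: ..."),
# then tokenize it the same way ChatDataPreprocess.tokenize_func does.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
encoding = tokenizer(prompt, add_special_tokens=False, max_length=512)
print(prompt)
print(encoding["input_ids"][:10])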