From 968582eabd78c5caab404cd4af9be7db962b3b0e Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 14 Jun 2024 10:29:24 +0800 Subject: [PATCH 01/15] fix evaluation --- llm_on_ray/finetune/finetune.py | 16 ++++++++++++++-- llm_on_ray/finetune/finetune_config.py | 4 ++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index eb4996cb5..1146ede79 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -140,7 +140,10 @@ def load_tokenizer(config: Dict): else: tokenizer_name = config["General"]["base_model"] load_config = config["General"].get("config", {}) - tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, **load_config) + padding_side = config["General"].get("padding_side") + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, padding_side, **load_config + ) return tokenizer @@ -198,6 +201,9 @@ def tokenize_dataset(config: Dict, tokenizer, dataset): max_length = config["Dataset"].get("max_length", 512) group = config["Dataset"].get("group", True) block_size = config["Dataset"].get("block_size", 512) + return_tensors = config["Dataset"].get("return_tensors", None) + padding = config["Dataset"].get("padding", "Max_length") + truncation = config["Dataset"].get("truncation", True) tokenizer.pad_token = tokenizer.eos_token if isinstance(dataset, datasets.Dataset): @@ -234,7 +240,13 @@ def prompt(rec): column_names += [template.TEXT_COLUMN_NAME] def tokenize_function(examples): - return tokenizer(examples[template.TEXT_COLUMN_NAME], max_length=max_length) + return tokenizer( + examples[template.TEXT_COLUMN_NAME], + padding=padding, + truncation=truncation, + return_tensors=return_tensors, + max_length=max_length, + ) tokenized_dataset = dataset.map( tokenize_function, diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index e1efd0b48..cfeadd21b 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -56,6 +56,7 @@ class General(BaseModel): config: GeneralConfig lora_config: Optional[LoraConfig] = None enable_gradient_checkpointing: bool = False + padding_side: str = "right" @validator("report_to") def check_report_to(cls, v: str): @@ -71,6 +72,9 @@ class Dataset(BaseModel): group: bool = True block_size: int = 512 shuffle: bool = False + return_tensors: Optional[str] = None + padding: str = "Max_length" + truncation: bool = True class RayResourceConfig(BaseModel): From 7d4b8999855f72ab8652fd5c31244e7e4ce872b8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sun, 16 Jun 2024 22:09:18 +0800 Subject: [PATCH 02/15] fix evaluation Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 115 ++++++++++++++++++++------------ llm_on_ray/finetune/template.py | 19 ++---- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 1146ede79..e8636c83f 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -19,6 +19,8 @@ import os import argparse import sys +import copy + from typing import Any, Dict, Union, Optional from itertools import chain @@ -41,6 +43,8 @@ from llm_on_ray.finetune.finetune_config import FinetuneConfig from importlib import util +IGNORE_INDEX = -100 + def adapt_transformers_to_device(config: Dict): device = config["Training"]["device"] @@ -201,53 +205,83 @@ def tokenize_dataset(config: Dict, tokenizer, dataset): max_length = 
config["Dataset"].get("max_length", 512) group = config["Dataset"].get("group", True) block_size = config["Dataset"].get("block_size", 512) - return_tensors = config["Dataset"].get("return_tensors", None) - padding = config["Dataset"].get("padding", "Max_length") - truncation = config["Dataset"].get("truncation", True) tokenizer.pad_token = tokenizer.eos_token - if isinstance(dataset, datasets.Dataset): - column_names = dataset.column_names - - if isinstance(dataset, datasets.DatasetDict): - column_names = dataset["train"].column_names - - if column_names and template.TEXT_COLUMN_NAME not in column_names: - - def prompt(rec): - instruction = rec["instruction"] - response = rec["response"] - context = rec.get("context") - if not instruction: - raise ValueError(f"Expected an instruction in: {rec}") - if not response: - raise ValueError(f"Expected a response in: {rec}") - if context: - rec["text"] = template.PROMPT_WITH_INPUT_FORMAT.format( - instruction=instruction, response=response, input=context + def prompt(rec, tokenizer): + instruction = rec["instruction"] + response = rec["response"] + context = rec.get("context") + end = tokenizer.eos_token + prompts = {} + prompts["prompt_sources"] = [] + prompts["prompt_targets"] = [] + if not instruction: + raise ValueError(f"Expected an instruction in: {rec}") + if not response: + raise ValueError(f"Expected a response in: {rec}") + if context: + prompts["prompt_sources"] = ( + template.PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, input=context) + + end + ) + else: + prompts["prompt_sources"] = ( + template.PROMPT_NO_INPUT_FORMAT.format(instruction=instruction) + end + ) + prompts["prompt_targets"] = template.RESPONSE_FORMAT.format(response=response) + end + return prompts + + for key in dataset: + prompts = prompt(dataset[key], tokenizer) + dataset[key] = datasets.Dataset.from_dict(prompts) + + def tokenize_function(examples): + max_seq_length = max_length + mask_input = False + mask_response = True + keys = list(examples.data.keys()) + if len(keys) != 2: + raise ValueError("Unsupported dataset format") + + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + for s, t in zip(examples[keys[0]], examples[keys[1]]): + results = tokenizer( + s + t, padding=False, truncation=True, return_tensors=None, max_length=max_length + ) + input_ids = results["input_ids"] + [tokenizer.eos_token_id] + input_len = len(input_ids) + labels = copy.deepcopy(input_ids) + # mask input + if mask_input: + sources_tokenized = tokenizer( + s, padding=False, truncation=True, return_tensors=None, max_length=max_length ) - else: - rec["text"] = template.PROMPT_NO_INPUT_FORMAT.format( - instruction=instruction, response=response + input_id_len = len(sources_tokenized["input_ids"]) + labels[:input_id_len] = [IGNORE_INDEX] * input_id_len + # mask response + if mask_response: + sources_tokenized = tokenizer( + s, padding=False, truncation=True, return_tensors=None, max_length=max_length + ) + input_id_len = len(sources_tokenized["input_ids"]) + labels[input_id_len + 1 : len(labels) - 1] = [IGNORE_INDEX] * ( + len(labels) - 1 - input_id_len ) - return rec - dataset = dataset.map( - prompt, - load_from_cache_file=False, - desc="Prompt", - ) - column_names += [template.TEXT_COLUMN_NAME] + # padding + pad_len = max_seq_length - input_len + input_ids = input_ids + [tokenizer.eos_token_id] * pad_len + labels = labels + [IGNORE_INDEX] * pad_len + attention_mask = [1] * input_len + [0] * pad_len + examples["input_ids"].append(input_ids) + 
examples["labels"].append(labels) + examples["attention_mask"].append(attention_mask) - def tokenize_function(examples): - return tokenizer( - examples[template.TEXT_COLUMN_NAME], - padding=padding, - truncation=truncation, - return_tensors=return_tensors, - max_length=max_length, - ) + return examples + column_names = list(dataset["train"].features) tokenized_dataset = dataset.map( tokenize_function, remove_columns=column_names, @@ -270,7 +304,6 @@ def group_texts(examples): k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } - result["labels"] = result["input_ids"].copy() return result tokenized_dataset = tokenized_dataset.map( diff --git a/llm_on_ray/finetune/template.py b/llm_on_ray/finetune/template.py index cf8647d7f..07bff5c6c 100644 --- a/llm_on_ray/finetune/template.py +++ b/llm_on_ray/finetune/template.py @@ -18,9 +18,8 @@ INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." INSTRUCTION_KEY = "### Instruction:" -INPUT_KEY = "Input:" +INPUT_KEY = "### Input:" RESPONSE_KEY = "### Response:" -END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" PROMPT_NO_INPUT_FORMAT = """{intro} @@ -29,15 +28,16 @@ {instruction} {response_key} -{response} - -{end_key}""".format( +{response}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY, +) + +RESPONSE_FORMAT = """ +{response}""".format( response="{response}", - end_key=END_KEY, ) PROMPT_WITH_INPUT_FORMAT = """{intro} @@ -48,17 +48,12 @@ {input_key} {input} -{response_key} -{response} - -{end_key}""".format( +{response_key}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", input_key=INPUT_KEY, input="{input}", response_key=RESPONSE_KEY, - response="{response}", - end_key=END_KEY, ) TEXT_COLUMN_NAME = "text" From e77efd1b919327861b3657501251d9da9f26fc78 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sun, 16 Jun 2024 22:29:12 +0800 Subject: [PATCH 03/15] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index e8636c83f..de8a492c4 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -237,8 +237,8 @@ def prompt(rec, tokenizer): def tokenize_function(examples): max_seq_length = max_length - mask_input = False - mask_response = True + mask_input = True + mask_response = False keys = list(examples.data.keys()) if len(keys) != 2: raise ValueError("Unsupported dataset format") @@ -266,15 +266,18 @@ def tokenize_function(examples): s, padding=False, truncation=True, return_tensors=None, max_length=max_length ) input_id_len = len(sources_tokenized["input_ids"]) - labels[input_id_len + 1 : len(labels) - 1] = [IGNORE_INDEX] * ( - len(labels) - 1 - input_id_len - ) + labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) + # padding pad_len = max_seq_length - input_len input_ids = input_ids + [tokenizer.eos_token_id] * pad_len labels = labels + [IGNORE_INDEX] * pad_len attention_mask = [1] * input_len + [0] * pad_len + + assert len(input_ids) == max_seq_length + assert len(labels) == len(input_ids) == len(attention_mask) + examples["input_ids"].append(input_ids) examples["labels"].append(labels) examples["attention_mask"].append(attention_mask) From f97705a4c55348c155559d809e35b6481070c801 Mon Sep 17 
00:00:00 2001 From: minmingzhu Date: Sun, 16 Jun 2024 23:11:47 +0800 Subject: [PATCH 04/15] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 7 +++---- llm_on_ray/finetune/finetune_config.py | 6 ++---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index de8a492c4..442de8bef 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -144,9 +144,8 @@ def load_tokenizer(config: Dict): else: tokenizer_name = config["General"]["base_model"] load_config = config["General"].get("config", {}) - padding_side = config["General"].get("padding_side") tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer_name, padding_side, **load_config + tokenizer_name, **load_config ) return tokenizer @@ -205,6 +204,8 @@ def tokenize_dataset(config: Dict, tokenizer, dataset): max_length = config["Dataset"].get("max_length", 512) group = config["Dataset"].get("group", True) block_size = config["Dataset"].get("block_size", 512) + mask_input = config["Dataset"].get("mask_input", False) + mask_response = config["Dataset"].get("mask_response", False) tokenizer.pad_token = tokenizer.eos_token def prompt(rec, tokenizer): @@ -237,8 +238,6 @@ def prompt(rec, tokenizer): def tokenize_function(examples): max_seq_length = max_length - mask_input = True - mask_response = False keys = list(examples.data.keys()) if len(keys) != 2: raise ValueError("Unsupported dataset format") diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index cfeadd21b..203020e91 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -56,7 +56,6 @@ class General(BaseModel): config: GeneralConfig lora_config: Optional[LoraConfig] = None enable_gradient_checkpointing: bool = False - padding_side: str = "right" @validator("report_to") def check_report_to(cls, v: str): @@ -72,9 +71,8 @@ class Dataset(BaseModel): group: bool = True block_size: int = 512 shuffle: bool = False - return_tensors: Optional[str] = None - padding: str = "Max_length" - truncation: bool = True + mask_input: bool = True + mask_response: bool = False class RayResourceConfig(BaseModel): From b641a6507a42e3fda4062f425e722d9017bf75e4 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sun, 16 Jun 2024 23:27:40 +0800 Subject: [PATCH 05/15] format Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 442de8bef..53dc4e610 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -144,9 +144,7 @@ def load_tokenizer(config: Dict): else: tokenizer_name = config["General"]["base_model"] load_config = config["General"].get("config", {}) - tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer_name, **load_config - ) + tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, **load_config) return tokenizer @@ -267,7 +265,6 @@ def tokenize_function(examples): input_id_len = len(sources_tokenized["input_ids"]) labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) - # padding pad_len = max_seq_length - input_len input_ids = input_ids + [tokenizer.eos_token_id] * pad_len From 83e8305c20ebd2deeece3710a9cc2bc2a4d4296d Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sun, 16 Jun 2024 23:36:40 +0800 Subject: [PATCH 06/15] update Signed-off-by: minmingzhu --- 
llm_on_ray/finetune/template.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm_on_ray/finetune/template.py b/llm_on_ray/finetune/template.py index 07bff5c6c..4cae74636 100644 --- a/llm_on_ray/finetune/template.py +++ b/llm_on_ray/finetune/template.py @@ -27,8 +27,7 @@ {instruction_key} {instruction} -{response_key} -{response}""".format( +{response_key}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", From e74679d1acb16800994324a3c1cecab9c4da4d1c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 17 Jun 2024 17:59:18 +0800 Subject: [PATCH 07/15] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 92 ++++++++++++++++++-------- llm_on_ray/finetune/finetune_config.py | 3 +- 2 files changed, 65 insertions(+), 30 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 53dc4e610..ca118f42a 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -18,6 +18,7 @@ import os import argparse +import re import sys import copy @@ -199,11 +200,10 @@ def local_load(name, **load_config): def tokenize_dataset(config: Dict, tokenizer, dataset): - max_length = config["Dataset"].get("max_length", 512) + max_seq_length = config["Dataset"].get("max_length", 512) group = config["Dataset"].get("group", True) block_size = config["Dataset"].get("block_size", 512) - mask_input = config["Dataset"].get("mask_input", False) - mask_response = config["Dataset"].get("mask_response", False) + max_source_length = config["Dataset"].get("max_source_length", 384) tokenizer.pad_token = tokenizer.eos_token def prompt(rec, tokenizer): @@ -234,47 +234,83 @@ def prompt(rec, tokenizer): prompts = prompt(dataset[key], tokenizer) dataset[key] = datasets.Dataset.from_dict(prompts) + def truncate_sequences(sequences, max_length): + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L40 + """ + words_to_cut = sum(list(map(len, sequences))) - max_length + if words_to_cut <= 0: + return sequences + + while words_to_cut > 0 and len(sequences) > 0: + words_to_cut -= len(sequences[0]) + sequences = sequences[1:] + + return sequences + def tokenize_function(examples): - max_seq_length = max_length - keys = list(examples.data.keys()) - if len(keys) != 2: - raise ValueError("Unsupported dataset format") + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 + The only differences are: + - using our own prompt style + """ + assistant = "### Response:\n" + end = tokenizer.eos_token + assistant_tokens = tokenizer.tokenize(assistant) + header = ( + "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
+ + end + + "\n" + ) + + instructions = [q.strip() for q in examples["prompt_sources"]] + responses = [q.strip() for q in examples["prompt_targets"]] examples["input_ids"] = [] examples["labels"] = [] examples["attention_mask"] = [] - for s, t in zip(examples[keys[0]], examples[keys[1]]): - results = tokenizer( - s + t, padding=False, truncation=True, return_tensors=None, max_length=max_length + + for instruction, response in zip(instructions, responses): + convs = re.findall( + r"### Instruction.*?{0}|### Response.*?{0}".format(end), instruction, re.DOTALL ) - input_ids = results["input_ids"] + [tokenizer.eos_token_id] - input_len = len(input_ids) - labels = copy.deepcopy(input_ids) - # mask input - if mask_input: - sources_tokenized = tokenizer( - s, padding=False, truncation=True, return_tensors=None, max_length=max_length - ) - input_id_len = len(sources_tokenized["input_ids"]) - labels[:input_id_len] = [IGNORE_INDEX] * input_id_len - # mask response - if mask_response: - sources_tokenized = tokenizer( - s, padding=False, truncation=True, return_tensors=None, max_length=max_length - ) - input_id_len = len(sources_tokenized["input_ids"]) - labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) + convs_tokens = [tokenizer.tokenize(conv) + tokenizer.tokenize("\n") for conv in convs] + header_tokens = tokenizer.tokenize(header) + tokenizer.tokenize("\n") + + max_input = max_source_length - len(header_tokens) - len(assistant_tokens) + + truncated_convs = truncate_sequences(convs_tokens, max_input) + + if len(truncated_convs) == 0: + truncated_convs = [convs_tokens[-1][: max_input - 3] + convs_tokens[-1][-3:]] + + prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens] + prompt_ids = [ + tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens + ] + prompt_ids = list(chain(*prompt_ids)) + + resp_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response.strip())) + # keep last and eos_id + max_resp = max_seq_length - len(prompt_ids) - 1 + if len(resp_ids) > max_resp: + resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] + + input_ids = prompt_ids + resp_ids + [tokenizer.eos_token_id] + labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [tokenizer.eos_token_id] # padding + input_len = len(input_ids) pad_len = max_seq_length - input_len input_ids = input_ids + [tokenizer.eos_token_id] * pad_len labels = labels + [IGNORE_INDEX] * pad_len attention_mask = [1] * input_len + [0] * pad_len assert len(input_ids) == max_seq_length + assert len(prompt_ids) <= max_source_length assert len(labels) == len(input_ids) == len(attention_mask) - examples["input_ids"].append(input_ids) + examples["input_ids"].append(torch.tensor(input_ids)) examples["labels"].append(labels) examples["attention_mask"].append(attention_mask) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 203020e91..a4cbec78f 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -71,8 +71,7 @@ class Dataset(BaseModel): group: bool = True block_size: int = 512 shuffle: bool = False - mask_input: bool = True - mask_response: bool = False + max_source_length: int = 384 class RayResourceConfig(BaseModel): From 42b64c3d8a985931ba03a7c5e5700ad83b339abd Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 18 Jun 2024 20:37:28 +0800 Subject: [PATCH 08/15] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/DataPreprocess.py | 216 +++++++++++++++++++++++++ 
llm_on_ray/finetune/finetune.py | 128 +++------------ llm_on_ray/finetune/finetune_config.py | 7 + 3 files changed, 241 insertions(+), 110 deletions(-) create mode 100644 llm_on_ray/finetune/DataPreprocess.py diff --git a/llm_on_ray/finetune/DataPreprocess.py b/llm_on_ray/finetune/DataPreprocess.py new file mode 100644 index 000000000..0368a5039 --- /dev/null +++ b/llm_on_ray/finetune/DataPreprocess.py @@ -0,0 +1,216 @@ +import copy +import re +from itertools import chain + +import torch + +from llm_on_ray.finetune import template + +IGNORE_INDEX = -100 + + +class AlpacaDataPreprocess: + def __init__(self, eos_token): + self.end = eos_token + + def prompt(self, examples): + prompts = {} + prompts["prompt_sources"] = [] + prompts["prompt_targets"] = [] + for rec in examples: + instruction = rec["instruction"] + response = rec["response"] + context = rec.get("context") + if not instruction: + raise ValueError(f"Expected an instruction in: {rec}") + if not response: + raise ValueError(f"Expected a response in: {rec}") + if context: + prompt = ( + template.PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, input=context) + + self.end + ) + prompts["prompt_sources"].append(prompt) + else: + prompt = template.PROMPT_NO_INPUT_FORMAT.format(instruction=instruction) + self.end + prompts["prompt_sources"].append(prompt) + prompt_response = template.RESPONSE_FORMAT.format(response=response) + self.end + prompts["prompt_targets"].append(prompt_response) + prompt += prompt_response + "\n" + + return prompts + + def tokenize_func(self, tokenizer, config): + padding_side = config["Dataset"].get("padding_side", "right") + config["Dataset"].get("truncation_side", "right") + max_length = max_source_length = config["Dataset"].get("max_length", 512) + max_seq_length = config["Dataset"].get("max_seq_length", 1024) + truncation = config["Dataset"].get("truncation", True) + padding = config["Dataset"].get("padding", True) + mask_input = config["Dataset"].get("mask_input", True) + mask_response = config["Dataset"].get("mask_response", True) + + def truncate_sequences(sequences, max_length): + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L40 + """ + words_to_cut = sum(list(map(len, sequences))) - max_length + if words_to_cut <= 0: + return sequences + + while words_to_cut > 0 and len(sequences) > 0: + words_to_cut -= len(sequences[0]) + sequences = sequences[1:] + + return sequences + + def preprocess_function_with_tokenize(examples): + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 + The only differences are: + - using our own prompt style + """ + print("preprocess_function_with_tokenize") + keys = list(examples.data.keys()) + if len(keys) != 2: + raise ValueError("Unsupported dataset format") + assistant = "### Response:\n" + end = tokenizer.eos_token + assistant_tokens = tokenizer.tokenize(assistant) + header = ( + "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
+ + end + + "\n" + ) + print(examples["prompt_sources"]) + instructions = [q.strip() for q in examples["prompt_sources"]] + print(instructions) + [q.strip() for q in examples["prompt_targets"]] + + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + + for instruction, response in zip(examples[keys[0]], examples[keys[1]]): + print("instruction") + print(instruction) + convs = re.findall( + r"### Instruction.*?{0}|### Response.*?{0}".format(end), instruction, re.DOTALL + ) + print(convs) + convs_tokens = [ + tokenizer.tokenize(conv) + tokenizer.tokenize("\n") for conv in convs + ] + header_tokens = tokenizer.tokenize(header) + tokenizer.tokenize("\n") + + max_input = max_source_length - len(header_tokens) - len(assistant_tokens) + + truncated_convs = truncate_sequences(convs_tokens, max_input) + + if len(truncated_convs) == 0: + truncated_convs = [convs_tokens[-1][: max_input - 3] + convs_tokens[-1][-3:]] + + prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens] + prompt_ids = [ + tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens + ] + prompt_ids = list(chain(*prompt_ids)) + + resp_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response.strip())) + # keep last and eos_id + max_resp = max_seq_length - len(prompt_ids) - 1 + if len(resp_ids) > max_resp: + resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] + + # masking + input_ids = prompt_ids + resp_ids + [tokenizer.eos_token_id] + if mask_input: + labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [tokenizer.eos_token_id] + elif mask_response: + labels = prompt_ids + [IGNORE_INDEX] * len(resp_ids) + [tokenizer.eos_token_id] + else: + labels = input_ids + + # padding + input_len = len(input_ids) + pad_len = max_seq_length - input_len + if padding_side == "right": + input_ids = input_ids + [tokenizer.eos_token_id] * pad_len + labels = labels + [IGNORE_INDEX] * pad_len + attention_mask = [1] * input_len + [0] * pad_len + else: + input_ids = [tokenizer.eos_token_id] * pad_len + input_ids + labels = [IGNORE_INDEX] * pad_len + labels + attention_mask = [0] * pad_len + [1] * input_len + + assert len(input_ids) == max_seq_length + assert len(prompt_ids) <= max_source_length + assert len(labels) == len(input_ids) == len(attention_mask) + + examples["input_ids"].append(torch.tensor(input_ids)) + examples["labels"].append(labels) + examples["attention_mask"].append(attention_mask) + + return examples + + def preprocess_function_with_tokenizer(examples): + keys = list(examples.data.keys()) + if len(keys) != 2: + raise ValueError("Unsupported dataset format") + + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + for s, t in zip(examples[keys[0]], examples[keys[1]]): + if padding is False: + results = tokenizer( + s + t, + padding=False, + truncation=True, + return_tensors=None, + max_length=max_length, + ) + input_ids = results["input_ids"] + input_len = len(input_ids) + labels = copy.deepcopy(input_ids) + # mask input + if mask_input: + sources_tokenized = tokenizer( + s, + padding=False, + truncation=True, + return_tensors=None, + max_length=max_length, + ) + input_id_len = len(sources_tokenized["input_ids"]) + labels[:input_id_len] = [IGNORE_INDEX] * input_id_len + if mask_response: + sources_tokenized = tokenizer( + s, + padding=False, + truncation=True, + return_tensors=None, + max_length=max_length, + ) + input_id_len = len(sources_tokenized["input_ids"]) + + labels[input_id_len:input_len] = [IGNORE_INDEX] * 
(input_len - input_id_len) + else: + results = tokenizer( + s + t, + padding=padding, + truncation=truncation, + return_tensors=None, + max_length=max_length, + ) + input_ids = results["input_ids"] + labels = copy.deepcopy(input_ids) + + attention_mask = results["attention_mask"] + examples["input_ids"].append(input_ids) + examples["labels"].append(labels) + examples["attention_mask"].append(attention_mask) + + return examples + + return preprocess_function_with_tokenize diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index ca118f42a..86cd84000 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -41,6 +41,7 @@ from llm_on_ray import common from llm_on_ray.finetune import template +from llm_on_ray.finetune.DataPreprocess import AlpacaDataPreprocess from llm_on_ray.finetune.finetune_config import FinetuneConfig from importlib import util @@ -145,7 +146,13 @@ def load_tokenizer(config: Dict): else: tokenizer_name = config["General"]["base_model"] load_config = config["General"].get("config", {}) - tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, **load_config) + # default padding side is right + padding_side = config["Dataset"].get("padding_side", "right") + # default truncation side is right + truncation_side = config["Dataset"].get("truncation_side", "right") + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, padding_side=padding_side, truncation_side=truncation_side, **load_config + ) return tokenizer @@ -200,125 +207,26 @@ def local_load(name, **load_config): def tokenize_dataset(config: Dict, tokenizer, dataset): - max_seq_length = config["Dataset"].get("max_length", 512) + config["Dataset"].get("max_length", 512) group = config["Dataset"].get("group", True) block_size = config["Dataset"].get("block_size", 512) - max_source_length = config["Dataset"].get("max_source_length", 384) + config["Dataset"].get("max_source_length", 384) tokenizer.pad_token = tokenizer.eos_token - def prompt(rec, tokenizer): - instruction = rec["instruction"] - response = rec["response"] - context = rec.get("context") - end = tokenizer.eos_token - prompts = {} - prompts["prompt_sources"] = [] - prompts["prompt_targets"] = [] - if not instruction: - raise ValueError(f"Expected an instruction in: {rec}") - if not response: - raise ValueError(f"Expected a response in: {rec}") - if context: - prompts["prompt_sources"] = ( - template.PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, input=context) - + end - ) - else: - prompts["prompt_sources"] = ( - template.PROMPT_NO_INPUT_FORMAT.format(instruction=instruction) + end - ) - prompts["prompt_targets"] = template.RESPONSE_FORMAT.format(response=response) + end - return prompts + preprocess = AlpacaDataPreprocess(tokenizer.eos_token) + print(dataset) for key in dataset: - prompts = prompt(dataset[key], tokenizer) + prompts = preprocess.prompt(dataset[key]) dataset[key] = datasets.Dataset.from_dict(prompts) - def truncate_sequences(sequences, max_length): - """ - Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L40 - """ - words_to_cut = sum(list(map(len, sequences))) - max_length - if words_to_cut <= 0: - return sequences - - while words_to_cut > 0 and len(sequences) > 0: - words_to_cut -= len(sequences[0]) - sequences = sequences[1:] - - return sequences - - def tokenize_function(examples): - """ - Copied from 
https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 - The only differences are: - - using our own prompt style - """ - assistant = "### Response:\n" - end = tokenizer.eos_token - assistant_tokens = tokenizer.tokenize(assistant) - header = ( - "Below is an instruction that describes a task. Write a response that appropriately completes the request." - + end - + "\n" - ) - - instructions = [q.strip() for q in examples["prompt_sources"]] - responses = [q.strip() for q in examples["prompt_targets"]] - - examples["input_ids"] = [] - examples["labels"] = [] - examples["attention_mask"] = [] - - for instruction, response in zip(instructions, responses): - convs = re.findall( - r"### Instruction.*?{0}|### Response.*?{0}".format(end), instruction, re.DOTALL - ) - convs_tokens = [tokenizer.tokenize(conv) + tokenizer.tokenize("\n") for conv in convs] - header_tokens = tokenizer.tokenize(header) + tokenizer.tokenize("\n") - - max_input = max_source_length - len(header_tokens) - len(assistant_tokens) - - truncated_convs = truncate_sequences(convs_tokens, max_input) - - if len(truncated_convs) == 0: - truncated_convs = [convs_tokens[-1][: max_input - 3] + convs_tokens[-1][-3:]] - - prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens] - prompt_ids = [ - tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens - ] - prompt_ids = list(chain(*prompt_ids)) - - resp_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response.strip())) - # keep last and eos_id - max_resp = max_seq_length - len(prompt_ids) - 1 - if len(resp_ids) > max_resp: - resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] - - input_ids = prompt_ids + resp_ids + [tokenizer.eos_token_id] - labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [tokenizer.eos_token_id] - - # padding - input_len = len(input_ids) - pad_len = max_seq_length - input_len - input_ids = input_ids + [tokenizer.eos_token_id] * pad_len - labels = labels + [IGNORE_INDEX] * pad_len - attention_mask = [1] * input_len + [0] * pad_len - - assert len(input_ids) == max_seq_length - assert len(prompt_ids) <= max_source_length - assert len(labels) == len(input_ids) == len(attention_mask) - - examples["input_ids"].append(torch.tensor(input_ids)) - examples["labels"].append(labels) - examples["attention_mask"].append(attention_mask) - - return examples - column_names = list(dataset["train"].features) + print(dataset) + + print(column_names) + preprocess_fn = preprocess.tokenize_func(tokenizer, config) tokenized_dataset = dataset.map( - tokenize_function, + preprocess_fn, remove_columns=column_names, load_from_cache_file=False, desc="Tokenize dataset", diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index a4cbec78f..c53fd1081 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -72,6 +72,13 @@ class Dataset(BaseModel): block_size: int = 512 shuffle: bool = False max_source_length: int = 384 + padding_side: str = "right" + truncation_side: str = "right" + max_seq_length: int = 1024 + truncation: bool = True + padding: bool = True + mask_input: bool = True + mask_response: bool = True class RayResourceConfig(BaseModel): From fe78c037f15255c1eaccc97de79ef67ca7324964 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 18 Jun 2024 22:54:16 +0800 Subject: [PATCH 09/15] update Signed-off-by: minmingzhu --- 
llm_on_ray/finetune/DataPreprocess.py | 46 ++++++++------------- llm_on_ray/finetune/finetune.py | 8 +--- llm_on_ray/finetune/template.py | 58 --------------------------- 3 files changed, 17 insertions(+), 95 deletions(-) delete mode 100644 llm_on_ray/finetune/template.py diff --git a/llm_on_ray/finetune/DataPreprocess.py b/llm_on_ray/finetune/DataPreprocess.py index 0368a5039..b0929e0e8 100644 --- a/llm_on_ray/finetune/DataPreprocess.py +++ b/llm_on_ray/finetune/DataPreprocess.py @@ -4,14 +4,16 @@ import torch -from llm_on_ray.finetune import template - IGNORE_INDEX = -100 class AlpacaDataPreprocess: def __init__(self, eos_token): self.end = eos_token + self.intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request." + self.instruction = "### Instruction:\n" + self.input = "### Input:\n" + self.response = "### Response:\n" def prompt(self, examples): prompts = {} @@ -26,23 +28,18 @@ def prompt(self, examples): if not response: raise ValueError(f"Expected a response in: {rec}") if context: - prompt = ( - template.PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, input=context) - + self.end - ) + prompt = self.intro + self.end + "\n" + self.instruction + instruction + self.input + context + self.end + "\n" + self.response prompts["prompt_sources"].append(prompt) else: - prompt = template.PROMPT_NO_INPUT_FORMAT.format(instruction=instruction) + self.end + prompt = self.intro + self.end + "\n" + self.instruction + instruction + self.end + "\n" + self.response prompts["prompt_sources"].append(prompt) - prompt_response = template.RESPONSE_FORMAT.format(response=response) + self.end + prompt_response = response + self.end prompts["prompt_targets"].append(prompt_response) - prompt += prompt_response + "\n" - return prompts def tokenize_func(self, tokenizer, config): padding_side = config["Dataset"].get("padding_side", "right") - config["Dataset"].get("truncation_side", "right") + truncation_side = config["Dataset"].get("truncation_side", "right") max_length = max_source_length = config["Dataset"].get("max_length", 512) max_seq_length = config["Dataset"].get("max_seq_length", 1024) truncation = config["Dataset"].get("truncation", True) @@ -61,7 +58,6 @@ def truncate_sequences(sequences, max_length): while words_to_cut > 0 and len(sequences) > 0: words_to_cut -= len(sequences[0]) sequences = sequences[1:] - return sequences def preprocess_function_with_tokenize(examples): @@ -70,43 +66,29 @@ def preprocess_function_with_tokenize(examples): The only differences are: - using our own prompt style """ - print("preprocess_function_with_tokenize") keys = list(examples.data.keys()) if len(keys) != 2: raise ValueError("Unsupported dataset format") - assistant = "### Response:\n" - end = tokenizer.eos_token - assistant_tokens = tokenizer.tokenize(assistant) + assistant_tokens = tokenizer.tokenize(self.response) header = ( "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
- + end + + self.end + "\n" ) - print(examples["prompt_sources"]) - instructions = [q.strip() for q in examples["prompt_sources"]] - print(instructions) - [q.strip() for q in examples["prompt_targets"]] examples["input_ids"] = [] examples["labels"] = [] examples["attention_mask"] = [] - for instruction, response in zip(examples[keys[0]], examples[keys[1]]): - print("instruction") - print(instruction) convs = re.findall( - r"### Instruction.*?{0}|### Response.*?{0}".format(end), instruction, re.DOTALL + r"### Instruction.*?{0}|### Response.*?{0}".format(self.end), instruction, re.DOTALL ) - print(convs) convs_tokens = [ tokenizer.tokenize(conv) + tokenizer.tokenize("\n") for conv in convs ] header_tokens = tokenizer.tokenize(header) + tokenizer.tokenize("\n") - max_input = max_source_length - len(header_tokens) - len(assistant_tokens) - truncated_convs = truncate_sequences(convs_tokens, max_input) - if len(truncated_convs) == 0: truncated_convs = [convs_tokens[-1][: max_input - 3] + convs_tokens[-1][-3:]] @@ -119,8 +101,12 @@ def preprocess_function_with_tokenize(examples): resp_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response.strip())) # keep last and eos_id max_resp = max_seq_length - len(prompt_ids) - 1 + if len(resp_ids) > max_resp: - resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] + if truncation_side == "right": + resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] + else: + resp_ids = resp_ids[-max_resp:] # masking input_ids = prompt_ids + resp_ids + [tokenizer.eos_token_id] diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 86cd84000..6c31ab16d 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -40,7 +40,6 @@ from pydantic_yaml import parse_yaml_raw_as from llm_on_ray import common -from llm_on_ray.finetune import template from llm_on_ray.finetune.DataPreprocess import AlpacaDataPreprocess from llm_on_ray.finetune.finetune_config import FinetuneConfig from importlib import util @@ -207,27 +206,22 @@ def local_load(name, **load_config): def tokenize_dataset(config: Dict, tokenizer, dataset): - config["Dataset"].get("max_length", 512) group = config["Dataset"].get("group", True) block_size = config["Dataset"].get("block_size", 512) - config["Dataset"].get("max_source_length", 384) tokenizer.pad_token = tokenizer.eos_token preprocess = AlpacaDataPreprocess(tokenizer.eos_token) - print(dataset) for key in dataset: prompts = preprocess.prompt(dataset[key]) dataset[key] = datasets.Dataset.from_dict(prompts) column_names = list(dataset["train"].features) - print(dataset) - - print(column_names) preprocess_fn = preprocess.tokenize_func(tokenizer, config) tokenized_dataset = dataset.map( preprocess_fn, remove_columns=column_names, + batched=True, load_from_cache_file=False, desc="Tokenize dataset", ) diff --git a/llm_on_ray/finetune/template.py b/llm_on_ray/finetune/template.py deleted file mode 100644 index 4cae74636..000000000 --- a/llm_on_ray/finetune/template.py +++ /dev/null @@ -1,58 +0,0 @@ -# -# Copyright 2023 The LLM-on-Ray Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -#!/usr/bin/env python - -INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." -INSTRUCTION_KEY = "### Instruction:" -INPUT_KEY = "### Input:" -RESPONSE_KEY = "### Response:" -RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" - -PROMPT_NO_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{response_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - response_key=RESPONSE_KEY, -) - -RESPONSE_FORMAT = """ -{response}""".format( - response="{response}", -) - -PROMPT_WITH_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{input_key} -{input} - -{response_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - input_key=INPUT_KEY, - input="{input}", - response_key=RESPONSE_KEY, -) -TEXT_COLUMN_NAME = "text" From 66ba1c8e1a1eb304eee53fbdf23f79d856249e2c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 19 Jun 2024 13:28:00 +0800 Subject: [PATCH 10/15] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/DataPreprocess.py | 87 +++++++++++++++------------ 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/llm_on_ray/finetune/DataPreprocess.py b/llm_on_ray/finetune/DataPreprocess.py index b0929e0e8..ef2e2f15e 100644 --- a/llm_on_ray/finetune/DataPreprocess.py +++ b/llm_on_ray/finetune/DataPreprocess.py @@ -28,10 +28,30 @@ def prompt(self, examples): if not response: raise ValueError(f"Expected a response in: {rec}") if context: - prompt = self.intro + self.end + "\n" + self.instruction + instruction + self.input + context + self.end + "\n" + self.response + prompt = ( + self.intro + + self.end + + "\n" + + self.instruction + + instruction + + self.input + + context + + self.end + + "\n" + + self.response + ) prompts["prompt_sources"].append(prompt) else: - prompt = self.intro + self.end + "\n" + self.instruction + instruction + self.end + "\n" + self.response + prompt = ( + self.intro + + self.end + + "\n" + + self.instruction + + instruction + + self.end + + "\n" + + self.response + ) prompts["prompt_sources"].append(prompt) prompt_response = response + self.end prompts["prompt_targets"].append(prompt_response) @@ -65,6 +85,8 @@ def preprocess_function_with_tokenize(examples): Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 The only differences are: - using our own prompt style + - add left or right padding and truncation + - add mask_input and mask_response """ keys = list(examples.data.keys()) if len(keys) != 2: @@ -81,7 +103,9 @@ def preprocess_function_with_tokenize(examples): examples["attention_mask"] = [] for instruction, response in zip(examples[keys[0]], examples[keys[1]]): convs = re.findall( - r"### Instruction.*?{0}|### Response.*?{0}".format(self.end), instruction, re.DOTALL + r"### Instruction.*?{0}|### Response.*?{0}".format(self.end), + instruction, + re.DOTALL, ) convs_tokens = [ tokenizer.tokenize(conv) + tokenizer.tokenize("\n") for conv in convs @@ -102,6 +126,7 @@ def preprocess_function_with_tokenize(examples): # keep last and eos_id max_resp = max_seq_length - len(prompt_ids) - 1 + # truncating response if len(resp_ids) > max_resp: if truncation_side == "right": resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] @@ 
-148,55 +173,37 @@ def preprocess_function_with_tokenizer(examples): examples["labels"] = [] examples["attention_mask"] = [] for s, t in zip(examples[keys[0]], examples[keys[1]]): - if padding is False: - results = tokenizer( - s + t, + results = tokenizer( + s + t, + padding=padding, + truncation=truncation, + return_tensors=None, + max_length=max_length, + ) + + input_ids = results["input_ids"] + input_len = len(input_ids) + labels = copy.deepcopy(input_ids) + if mask_input or mask_response: + sources_tokenized = tokenizer( + s, padding=False, truncation=True, return_tensors=None, max_length=max_length, ) - input_ids = results["input_ids"] - input_len = len(input_ids) - labels = copy.deepcopy(input_ids) + input_id_len = len(sources_tokenized["input_ids"]) # mask input if mask_input: - sources_tokenized = tokenizer( - s, - padding=False, - truncation=True, - return_tensors=None, - max_length=max_length, - ) - input_id_len = len(sources_tokenized["input_ids"]) labels[:input_id_len] = [IGNORE_INDEX] * input_id_len + # mask response if mask_response: - sources_tokenized = tokenizer( - s, - padding=False, - truncation=True, - return_tensors=None, - max_length=max_length, - ) - input_id_len = len(sources_tokenized["input_ids"]) - labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) - else: - results = tokenizer( - s + t, - padding=padding, - truncation=truncation, - return_tensors=None, - max_length=max_length, - ) - input_ids = results["input_ids"] - labels = copy.deepcopy(input_ids) - attention_mask = results["attention_mask"] - examples["input_ids"].append(input_ids) + examples["input_ids"].append(results["input_ids"]) examples["labels"].append(labels) - examples["attention_mask"].append(attention_mask) - + examples["attention_mask"].append(results["attention_mask"]) return examples + return preprocess_function_with_tokenize From c1136b9ca21874fc0c5915fa9fc78974a466012b Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 20 Jun 2024 14:57:26 +0800 Subject: [PATCH 11/15] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/DataPreprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_on_ray/finetune/DataPreprocess.py b/llm_on_ray/finetune/DataPreprocess.py index ef2e2f15e..f6326519a 100644 --- a/llm_on_ray/finetune/DataPreprocess.py +++ b/llm_on_ray/finetune/DataPreprocess.py @@ -205,5 +205,4 @@ def preprocess_function_with_tokenizer(examples): examples["attention_mask"].append(results["attention_mask"]) return examples - return preprocess_function_with_tokenize From 268067d50b9697895d790dc2f60e95c3d18dd47e Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 20 Jun 2024 15:12:19 +0800 Subject: [PATCH 12/15] fix license-header Signed-off-by: minmingzhu --- llm_on_ray/finetune/DataPreprocess.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llm_on_ray/finetune/DataPreprocess.py b/llm_on_ray/finetune/DataPreprocess.py index f6326519a..46fe85e3f 100644 --- a/llm_on_ray/finetune/DataPreprocess.py +++ b/llm_on_ray/finetune/DataPreprocess.py @@ -1,3 +1,19 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import copy import re from itertools import chain From cc94c4a6b1d033f191469b5ddceddb3d107f86d9 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 21 Jun 2024 16:14:52 +0800 Subject: [PATCH 13/15] 1. update doc 2. fix comments Signed-off-by: minmingzhu --- docs/finetune_parameters.md | 29 ++++++++++------ .../{DataPreprocess.py => data_preprocess.py} | 33 +++++++++++-------- llm_on_ray/finetune/finetune.py | 9 ++--- llm_on_ray/finetune/finetune_config.py | 3 +- 4 files changed, 43 insertions(+), 31 deletions(-) rename llm_on_ray/finetune/{DataPreprocess.py => data_preprocess.py} (87%) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 55c303e70..94c468032 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -19,16 +19,25 @@ The following are the parameters supported in the finetuning workflow. ## Dataset Parameters -|Configuration Name| Default|Meaning| -|-|-|-| -|train_file|examples/data/sample_finetune_data.jsonl|A json file containing the training data.| -|validation_file|None|A json file containing the validation data.| -|validation_split_percentage|5|The percentage of the train set used as validation set in case there's no validation split| -|preprocessing_num_workers|None|The number of processes to use for the preprocessing.| -|max_length|512|Padding sequential data to max length of a batch| -|group|True|Whether to concatenate the sentence for more efficient training| -|block_size|512|The block size of concatenated sentence| -|shuffle|False|Whether shuffle the data at every epoch| +| Configuration Name | Default| Meaning | +|-----------------------------|-|------------------------------------------------------------------------------------------------------------------------------------------| +| train_file |examples/data/sample_finetune_data.jsonl| A json file containing the training data. | +| validation_file |None| A json file containing the validation data. | +| validation_split_percentage |5| The percentage of the train set used as validation set in case there's no validation split | +| preprocessing_num_workers |None| The number of processes to use for the preprocessing. | +| max_length |512| Padding sequential data to max length of a batch | +| group |True| Whether to concatenate the sentence for more efficient training | +| block_size |512| The block size of concatenated sentence | +| shuffle |False| Whether shuffle the data at every epoch | +| max_source_length |384| The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded. | +| padding_side |right| The side on which the model should have padding applied. Should be selected between ['right', 'left']. | +| truncation_side |right| The side on which the model should have truncation applied. Should be selected between ['right', 'left']. | +| max_seq_length |max_length| The maximum total input sequence length after tokenization. | +| truncation |True| truncation strategy. Should be selected between ['only_first', 'only_second', 'longest_first/True', 'do_not_truncate/False']. 
|
| padding |True| padding strategy. Should be selected between ['longest/True', 'do_not_pad/False', 'max_length'] |
| mask_input |True| mask the input part in labels |
| mask_response |True| mask the response part in labels |
| data_preprocess_type |neural_chat| The type of preprocessing used to encode the input |

## Training Parameters

diff --git a/llm_on_ray/finetune/DataPreprocess.py b/llm_on_ray/finetune/data_preprocess.py similarity index 87% rename from llm_on_ray/finetune/DataPreprocess.py rename to llm_on_ray/finetune/data_preprocess.py index 46fe85e3f..8691f4c27 100644 --- a/llm_on_ray/finetune/DataPreprocess.py +++ b/llm_on_ray/finetune/data_preprocess.py @@ -23,8 +23,10 @@ IGNORE_INDEX = -100 -class AlpacaDataPreprocess: - def __init__(self, eos_token): +class DataPreprocess: + # We used the following prompts for fine-tuning the Alpaca model. You can find reference doc form this URL(https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release) + def __init__(self, config, eos_token): + self.config = config self.end = eos_token self.intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request." self.instruction = "### Instruction:\n" @@ -73,15 +75,15 @@ def prompt(self, examples): prompts["prompt_targets"].append(prompt_response) return prompts - def tokenize_func(self, tokenizer, config): - padding_side = config["Dataset"].get("padding_side", "right") - truncation_side = config["Dataset"].get("truncation_side", "right") - max_length = max_source_length = config["Dataset"].get("max_length", 512) - max_seq_length = config["Dataset"].get("max_seq_length", 1024) - truncation = config["Dataset"].get("truncation", True) - padding = config["Dataset"].get("padding", True) - mask_input = config["Dataset"].get("mask_input", True) - mask_response = config["Dataset"].get("mask_response", True) + def tokenize(self, tokenizer): + padding_side = self.config["Dataset"].get("padding_side", "right") + truncation_side = self.config["Dataset"].get("truncation_side", "right") + max_length = max_seq_length = self.config["Dataset"].get("max_length", 512) + max_source_length = self.config["Dataset"].get("max_source_length", 384) + truncation = self.config["Dataset"].get("truncation", True) + padding = self.config["Dataset"].get("padding", True) + mask_input = self.config["Dataset"].get("mask_input", True) + mask_response = self.config["Dataset"].get("mask_response", True) def truncate_sequences(sequences, max_length): """ @@ -96,7 +98,7 @@ def truncate_sequences(sequences, max_length): sequences = sequences[1:] return sequences - def preprocess_function_with_tokenize(examples): + def preprocess_function_with_neural_chat(examples): """ Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 The only differences are: @@ -180,7 +182,7 @@ def preprocess_function_with_tokenize(examples): return examples - def preprocess_function_with_tokenizer(examples): + def preprocess_function_encode_inputs(examples): keys = list(examples.data.keys()) if len(keys) != 2: raise ValueError("Unsupported dataset format") @@ -221,4 +223,7 @@ def preprocess_function_with_tokenizer(examples): examples["attention_mask"].append(results["attention_mask"]) return examples - return preprocess_function_with_tokenize + if self.config["Dataset"].get("data_preprocess_type", "neural_chat") == "neural_chat": + return
preprocess_function_with_neural_chat + + return preprocess_function_encode_inputs diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 6c31ab16d..3b551c4e3 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -40,11 +40,8 @@ from pydantic_yaml import parse_yaml_raw_as from llm_on_ray import common -from llm_on_ray.finetune.DataPreprocess import AlpacaDataPreprocess +from llm_on_ray.finetune.data_preprocess import DataPreprocess from llm_on_ray.finetune.finetune_config import FinetuneConfig -from importlib import util - -IGNORE_INDEX = -100 def adapt_transformers_to_device(config: Dict): @@ -210,14 +207,14 @@ def tokenize_dataset(config: Dict, tokenizer, dataset): block_size = config["Dataset"].get("block_size", 512) tokenizer.pad_token = tokenizer.eos_token - preprocess = AlpacaDataPreprocess(tokenizer.eos_token) + preprocess = DataPreprocess(config, tokenizer.eos_token) for key in dataset: prompts = preprocess.prompt(dataset[key]) dataset[key] = datasets.Dataset.from_dict(prompts) column_names = list(dataset["train"].features) - preprocess_fn = preprocess.tokenize_func(tokenizer, config) + preprocess_fn = preprocess.tokenize(tokenizer) tokenized_dataset = dataset.map( preprocess_fn, remove_columns=column_names, diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index c53fd1081..27bbe3cd5 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -74,11 +74,12 @@ class Dataset(BaseModel): max_source_length: int = 384 padding_side: str = "right" truncation_side: str = "right" - max_seq_length: int = 1024 + max_seq_length: int = 512 truncation: bool = True padding: bool = True mask_input: bool = True mask_response: bool = True + data_preprocess_type: str = "neural_chat" class RayResourceConfig(BaseModel): From a633a134b9192ee4387f9b39b5f29e9a93d753f8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 24 Jun 2024 13:50:31 +0800 Subject: [PATCH 14/15] fix comments Signed-off-by: minmingzhu --- .../finetune/{data_preprocess.py => data_process.py} | 12 ++++-------- llm_on_ray/finetune/finetune.py | 10 +++++----- 2 files changed, 9 insertions(+), 13 deletions(-) rename llm_on_ray/finetune/{data_preprocess.py => data_process.py} (96%) diff --git a/llm_on_ray/finetune/data_preprocess.py b/llm_on_ray/finetune/data_process.py similarity index 96% rename from llm_on_ray/finetune/data_preprocess.py rename to llm_on_ray/finetune/data_process.py index 8691f4c27..a39ed3565 100644 --- a/llm_on_ray/finetune/data_preprocess.py +++ b/llm_on_ray/finetune/data_process.py @@ -23,7 +23,7 @@ IGNORE_INDEX = -100 -class DataPreprocess: +class DataProcessor: # We used the following prompts for fine-tuning the Alpaca model. You can find reference doc form this URL(https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release) def __init__(self, config, eos_token): self.config = config @@ -33,7 +33,7 @@ def __init__(self, config, eos_token): self.input = "### Input:\n" self.response = "### Response:\n" - def prompt(self, examples): + def make_prompt(self, examples): prompts = {} prompts["prompt_sources"] = [] prompts["prompt_targets"] = [] @@ -110,18 +110,14 @@ def preprocess_function_with_neural_chat(examples): if len(keys) != 2: raise ValueError("Unsupported dataset format") assistant_tokens = tokenizer.tokenize(self.response) - header = ( - "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
- + self.end - + "\n" - ) + header = self.intro + self.end + "\n" examples["input_ids"] = [] examples["labels"] = [] examples["attention_mask"] = [] for instruction, response in zip(examples[keys[0]], examples[keys[1]]): convs = re.findall( - r"### Instruction.*?{0}|### Response.*?{0}".format(self.end), + r"{0}.*?{2}|{1}.*?{2}".format(self.instruction, self.response, self.end), instruction, re.DOTALL, ) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 3b551c4e3..6bb3c017c 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -40,7 +40,7 @@ from pydantic_yaml import parse_yaml_raw_as from llm_on_ray import common -from llm_on_ray.finetune.data_preprocess import DataPreprocess +from llm_on_ray.finetune.data_process import DataProcessor from llm_on_ray.finetune.finetune_config import FinetuneConfig @@ -207,16 +207,16 @@ def tokenize_dataset(config: Dict, tokenizer, dataset): block_size = config["Dataset"].get("block_size", 512) tokenizer.pad_token = tokenizer.eos_token - preprocess = DataPreprocess(config, tokenizer.eos_token) + processor = DataProcessor(config, tokenizer.eos_token) for key in dataset: - prompts = preprocess.prompt(dataset[key]) + prompts = processor.make_prompt(dataset[key]) dataset[key] = datasets.Dataset.from_dict(prompts) column_names = list(dataset["train"].features) - preprocess_fn = preprocess.tokenize(tokenizer) + processor_fn = processor.tokenize(tokenizer) tokenized_dataset = dataset.map( - preprocess_fn, + processor_fn, remove_columns=column_names, batched=True, load_from_cache_file=False, From d3c99ea1d3473c3489e73c060dc0f3712bfe14d2 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 24 Jun 2024 17:29:48 +0800 Subject: [PATCH 15/15] fix comments Signed-off-by: minmingzhu --- llm_on_ray/finetune/data_process.py | 287 ++++++++++++++-------------- llm_on_ray/finetune/finetune.py | 11 +- 2 files changed, 149 insertions(+), 149 deletions(-) diff --git a/llm_on_ray/finetune/data_process.py b/llm_on_ray/finetune/data_process.py index a39ed3565..6435928a1 100644 --- a/llm_on_ray/finetune/data_process.py +++ b/llm_on_ray/finetune/data_process.py @@ -25,13 +25,21 @@ class DataProcessor: # We used the following prompts for fine-tuning the Alpaca model. You can find reference doc form this URL(https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release) - def __init__(self, config, eos_token): - self.config = config - self.end = eos_token + def __init__(self, config, tokenizer): + self.tokenizer = tokenizer + self.end = tokenizer.eos_token self.intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
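For reference, the constructor now reads every tokenization knob from the `Dataset` section of the config (the remaining assignments continue just below). A minimal construction sketch, assuming the defaults declared in `finetune_config.py` and a placeholder `"gpt2"` checkpoint for the tokenizer:

```python
# Sketch only: "gpt2" is a placeholder checkpoint, and the Dataset keys mirror
# the defaults declared in finetune_config.py.
import transformers

from llm_on_ray.finetune.data_process import DataProcessor

config = {
    "Dataset": {
        "padding_side": "right",
        "truncation_side": "right",
        "max_length": 512,
        "max_source_length": 384,
        "truncation": True,
        "padding": True,
        "mask_input": True,
        "mask_response": True,
        "data_preprocess_type": "neural_chat",
    }
}

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
processor = DataProcessor(config, tokenizer)
```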
self.instruction = "### Instruction:\n" self.input = "### Input:\n" self.response = "### Response:\n" + self.padding_side = config["Dataset"].get("padding_side", "right") + self.truncation_side = config["Dataset"].get("truncation_side", "right") + self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512) + self.max_source_length = config["Dataset"].get("max_source_length", 384) + self.truncation = config["Dataset"].get("truncation", True) + self.padding = config["Dataset"].get("padding", True) + self.mask_input = config["Dataset"].get("mask_input", True) + self.mask_response = config["Dataset"].get("mask_response", True) def make_prompt(self, examples): prompts = {} @@ -75,151 +83,138 @@ def make_prompt(self, examples): prompts["prompt_targets"].append(prompt_response) return prompts - def tokenize(self, tokenizer): - padding_side = self.config["Dataset"].get("padding_side", "right") - truncation_side = self.config["Dataset"].get("truncation_side", "right") - max_length = max_seq_length = self.config["Dataset"].get("max_length", 512) - max_source_length = self.config["Dataset"].get("max_source_length", 384) - truncation = self.config["Dataset"].get("truncation", True) - padding = self.config["Dataset"].get("padding", True) - mask_input = self.config["Dataset"].get("mask_input", True) - mask_response = self.config["Dataset"].get("mask_response", True) - - def truncate_sequences(sequences, max_length): - """ - Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L40 - """ - words_to_cut = sum(list(map(len, sequences))) - max_length - if words_to_cut <= 0: - return sequences - - while words_to_cut > 0 and len(sequences) > 0: - words_to_cut -= len(sequences[0]) - sequences = sequences[1:] + def __truncate_sequences(self, sequences, max_length): + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L40 + """ + words_to_cut = sum(list(map(len, sequences))) - max_length + if words_to_cut <= 0: return sequences - def preprocess_function_with_neural_chat(examples): - """ - Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 - The only differences are: - - using our own prompt style - - add left or right padding and truncation - - add mask_input and mask_response - """ - keys = list(examples.data.keys()) - if len(keys) != 2: - raise ValueError("Unsupported dataset format") - assistant_tokens = tokenizer.tokenize(self.response) - header = self.intro + self.end + "\n" - - examples["input_ids"] = [] - examples["labels"] = [] - examples["attention_mask"] = [] - for instruction, response in zip(examples[keys[0]], examples[keys[1]]): - convs = re.findall( - r"{0}.*?{2}|{1}.*?{2}".format(self.instruction, self.response, self.end), - instruction, - re.DOTALL, - ) - convs_tokens = [ - tokenizer.tokenize(conv) + tokenizer.tokenize("\n") for conv in convs - ] - header_tokens = tokenizer.tokenize(header) + tokenizer.tokenize("\n") - max_input = max_source_length - len(header_tokens) - len(assistant_tokens) - truncated_convs = truncate_sequences(convs_tokens, max_input) - if len(truncated_convs) == 0: - truncated_convs = [convs_tokens[-1][: max_input - 3] + 
convs_tokens[-1][-3:]] - - prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens] - prompt_ids = [ - tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens - ] - prompt_ids = list(chain(*prompt_ids)) - - resp_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response.strip())) - # keep last and eos_id - max_resp = max_seq_length - len(prompt_ids) - 1 - - # truncating response - if len(resp_ids) > max_resp: - if truncation_side == "right": - resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] - else: - resp_ids = resp_ids[-max_resp:] - - # masking - input_ids = prompt_ids + resp_ids + [tokenizer.eos_token_id] - if mask_input: - labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [tokenizer.eos_token_id] - elif mask_response: - labels = prompt_ids + [IGNORE_INDEX] * len(resp_ids) + [tokenizer.eos_token_id] - else: - labels = input_ids - - # padding - input_len = len(input_ids) - pad_len = max_seq_length - input_len - if padding_side == "right": - input_ids = input_ids + [tokenizer.eos_token_id] * pad_len - labels = labels + [IGNORE_INDEX] * pad_len - attention_mask = [1] * input_len + [0] * pad_len + while words_to_cut > 0 and len(sequences) > 0: + words_to_cut -= len(sequences[0]) + sequences = sequences[1:] + return sequences + + def tokenize_by_neural_chat(self, examples): + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 + The only differences are: + - using our own prompt style + - add left or right padding and truncation + - add mask_input and mask_response + """ + keys = list(examples.data.keys()) + if len(keys) != 2: + raise ValueError("Unsupported dataset format") + assistant_tokens = self.tokenizer.tokenize(self.response) + header = self.intro + self.end + "\n" + + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + for instruction, response in zip(examples[keys[0]], examples[keys[1]]): + convs = re.findall( + r"{0}.*?{2}|{1}.*?{2}".format(self.instruction, self.response, self.end), + instruction, + re.DOTALL, + ) + convs_tokens = [ + self.tokenizer.tokenize(conv) + self.tokenizer.tokenize("\n") for conv in convs + ] + header_tokens = self.tokenizer.tokenize(header) + self.tokenizer.tokenize("\n") + max_input = self.max_source_length - len(header_tokens) - len(assistant_tokens) + truncated_convs = self.__truncate_sequences(convs_tokens, max_input) + if len(truncated_convs) == 0: + truncated_convs = [convs_tokens[-1][: max_input - 3] + convs_tokens[-1][-3:]] + + prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens] + prompt_ids = [ + self.tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens + ] + prompt_ids = list(chain(*prompt_ids)) + + resp_ids = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(response.strip()) + ) + # keep last and eos_id + max_resp = self.max_seq_length - len(prompt_ids) - 1 + + # truncating response + if len(resp_ids) > max_resp: + if self.truncation_side == "right": + resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] else: - input_ids = [tokenizer.eos_token_id] * pad_len + input_ids - labels = [IGNORE_INDEX] * pad_len + labels - attention_mask = [0] * pad_len + [1] * input_len - - assert len(input_ids) == max_seq_length - assert len(prompt_ids) <= max_source_length - assert len(labels) == len(input_ids) == len(attention_mask) - - 
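The `mask_input` / `mask_response` / `padding_side` branches here are the core of the labelling scheme, so a simplified restatement outside the diff may help. The helper below is a sketch on plain token-id lists, assuming the eos id doubles as the pad id (as in the patch); it is not the patch's code:

```python
IGNORE_INDEX = -100  # label value ignored by the loss


def mask_and_pad(prompt_ids, resp_ids, eos_id, max_seq_length,
                 mask_input=True, mask_response=True, padding_side="right"):
    """Simplified restatement of the masking/padding done in tokenize_by_neural_chat."""
    input_ids = prompt_ids + resp_ids + [eos_id]
    if mask_input:        # train only on the response tokens
        labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [eos_id]
    elif mask_response:   # train only on the prompt tokens
        labels = prompt_ids + [IGNORE_INDEX] * len(resp_ids) + [eos_id]
    else:
        labels = list(input_ids)

    input_len = len(input_ids)
    pad_len = max_seq_length - input_len
    if padding_side == "right":
        input_ids = input_ids + [eos_id] * pad_len
        labels = labels + [IGNORE_INDEX] * pad_len
        attention_mask = [1] * input_len + [0] * pad_len
    else:                 # left padding
        input_ids = [eos_id] * pad_len + input_ids
        labels = [IGNORE_INDEX] * pad_len + labels
        attention_mask = [0] * pad_len + [1] * input_len
    return input_ids, labels, attention_mask


# prompt [5, 6, 7], response [8, 9], eos 0, max_seq_length 8 gives
# labels [-100, -100, -100, 8, 9, 0, -100, -100] with the default right padding.
```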
examples["input_ids"].append(torch.tensor(input_ids)) - examples["labels"].append(labels) - examples["attention_mask"].append(attention_mask) - - return examples - - def preprocess_function_encode_inputs(examples): - keys = list(examples.data.keys()) - if len(keys) != 2: - raise ValueError("Unsupported dataset format") - - examples["input_ids"] = [] - examples["labels"] = [] - examples["attention_mask"] = [] - for s, t in zip(examples[keys[0]], examples[keys[1]]): - results = tokenizer( - s + t, - padding=padding, - truncation=truncation, + resp_ids = resp_ids[-max_resp:] + + # masking + input_ids = prompt_ids + resp_ids + [self.tokenizer.eos_token_id] + if self.mask_input: + labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [self.tokenizer.eos_token_id] + elif self.mask_response: + labels = prompt_ids + [IGNORE_INDEX] * len(resp_ids) + [self.tokenizer.eos_token_id] + else: + labels = input_ids + + # padding + input_len = len(input_ids) + pad_len = self.max_seq_length - input_len + if self.padding_side == "right": + input_ids = input_ids + [self.tokenizer.eos_token_id] * pad_len + labels = labels + [IGNORE_INDEX] * pad_len + attention_mask = [1] * input_len + [0] * pad_len + else: + input_ids = [self.tokenizer.eos_token_id] * pad_len + input_ids + labels = [IGNORE_INDEX] * pad_len + labels + attention_mask = [0] * pad_len + [1] * input_len + + assert len(input_ids) == self.max_seq_length + assert len(prompt_ids) <= self.max_source_length + assert len(labels) == len(input_ids) == len(attention_mask) + + examples["input_ids"].append(torch.tensor(input_ids)) + examples["labels"].append(labels) + examples["attention_mask"].append(attention_mask) + + return examples + + def tokenize(self, examples): + keys = list(examples.data.keys()) + if len(keys) != 2: + raise ValueError("Unsupported dataset format") + + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + for s, t in zip(examples[keys[0]], examples[keys[1]]): + results = self.tokenizer( + s + t, + padding=self.padding, + truncation=self.truncation, + return_tensors=None, + max_length=self.max_length, + ) + + input_ids = results["input_ids"] + input_len = len(input_ids) + labels = copy.deepcopy(input_ids) + if self.mask_input or self.mask_response: + sources_tokenized = self.tokenizer( + s, + padding=False, + truncation=True, return_tensors=None, - max_length=max_length, + max_length=self.max_length, ) - - input_ids = results["input_ids"] - input_len = len(input_ids) - labels = copy.deepcopy(input_ids) - if mask_input or mask_response: - sources_tokenized = tokenizer( - s, - padding=False, - truncation=True, - return_tensors=None, - max_length=max_length, - ) - input_id_len = len(sources_tokenized["input_ids"]) - # mask input - if mask_input: - labels[:input_id_len] = [IGNORE_INDEX] * input_id_len - # mask response - if mask_response: - labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) - - examples["input_ids"].append(results["input_ids"]) - examples["labels"].append(labels) - examples["attention_mask"].append(results["attention_mask"]) - return examples - - if self.config["Dataset"].get("data_preprocess_type", "neural_chat") == "neural_chat": - return preprocess_function_with_neural_chat - - return preprocess_function_encode_inputs + input_id_len = len(sources_tokenized["input_ids"]) + # mask input + if self.mask_input: + labels[:input_id_len] = [IGNORE_INDEX] * input_id_len + # mask response + if self.mask_response: + labels[input_id_len:input_len] = [IGNORE_INDEX] * 
(input_len - input_id_len) + + examples["input_ids"].append(results["input_ids"]) + examples["labels"].append(labels) + examples["attention_mask"].append(results["attention_mask"]) + return examples diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 6bb3c017c..8c67dcb4d 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -207,16 +207,21 @@ def tokenize_dataset(config: Dict, tokenizer, dataset): block_size = config["Dataset"].get("block_size", 512) tokenizer.pad_token = tokenizer.eos_token - processor = DataProcessor(config, tokenizer.eos_token) + processor = DataProcessor(config, tokenizer) for key in dataset: prompts = processor.make_prompt(dataset[key]) dataset[key] = datasets.Dataset.from_dict(prompts) column_names = list(dataset["train"].features) - processor_fn = processor.tokenize(tokenizer) + tokenize_fn = ( + processor.tokenize_by_neural_chat + if config["Dataset"].get("data_preprocess_type", "neural_chat") == "neural_chat" + else processor.tokenize + ) + tokenized_dataset = dataset.map( - processor_fn, + tokenize_fn, remove_columns=column_names, batched=True, load_from_cache_file=False,
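Putting the pieces together, the `data_preprocess_type` switch above is the only place where the two tokenization paths diverge. A rough usage sketch outside of Ray, assuming that any value other than `"neural_chat"` (here the made-up `"plain"`) selects the generic `tokenize` path, with `"gpt2"` and a toy two-column dataset as stand-ins:

```python
# Sketch only: "gpt2", the "plain" preprocess type, and the toy records are
# illustrative stand-ins; the column names follow the prompt_sources /
# prompt_targets convention produced by DataProcessor.make_prompt.
import datasets
import transformers

from llm_on_ray.finetune.data_process import DataProcessor

config = {
    "Dataset": {
        "max_length": 128,
        "mask_input": True,
        "mask_response": False,
        "data_preprocess_type": "plain",
    }
}
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
processor = DataProcessor(config, tokenizer)

raw = datasets.Dataset.from_dict(
    {
        "prompt_sources": ["### Instruction:\nSay hello.\n### Response:\n"],
        "prompt_targets": ["Hello!"],
    }
)

tokenize_fn = (
    processor.tokenize_by_neural_chat
    if config["Dataset"].get("data_preprocess_type", "neural_chat") == "neural_chat"
    else processor.tokenize
)
tokenized = raw.map(
    tokenize_fn,
    remove_columns=list(raw.features),
    batched=True,
    load_from_cache_file=False,
)
print(tokenized[0]["input_ids"][:8], tokenized[0]["labels"][:8])
```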