[Finetune] Fix evaluation #252

Merged
merged 15 commits on Jun 28, 2024
fix comments
Signed-off-by: minmingzhu <minming.zhu@intel.com>
minmingzhu committed Jun 26, 2024
commit a633a134b9192ee4387f9b39b5f29e9a93d753f8
llm_on_ray/finetune/data_process.py
@@ -23,7 +23,7 @@
 IGNORE_INDEX = -100


-class DataPreprocess:
+class DataProcessor:
     # We used the following prompts for fine-tuning the Alpaca model. See the reference doc at https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release
     def __init__(self, config, eos_token):
         self.config = config
@@ -33,7 +33,7 @@ def __init__(self, config, eos_token):
         self.input = "### Input:\n"
         self.response = "### Response:\n"

-    def prompt(self, examples):
+    def make_prompt(self, examples):
         prompts = {}
         prompts["prompt_sources"] = []
         prompts["prompt_targets"] = []
@@ -110,18 +110,14 @@ def preprocess_function_with_neural_chat(examples):
             if len(keys) != 2:
                 raise ValueError("Unsupported dataset format")
             assistant_tokens = tokenizer.tokenize(self.response)
-            header = (
-                "Below is an instruction that describes a task. Write a response that appropriately completes the request."
-                + self.end
-                + "\n"
-            )
+            header = self.intro + self.end + "\n"

             examples["input_ids"] = []
             examples["labels"] = []
             examples["attention_mask"] = []
             for instruction, response in zip(examples[keys[0]], examples[keys[1]]):
                 convs = re.findall(
-                    r"### Instruction.*?{0}|### Response.*?{0}".format(self.end),
+                    r"{0}.*?{2}|{1}.*?{2}".format(self.instruction, self.response, self.end),
                     instruction,
                     re.DOTALL,
                 )
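To illustrate the regex change: the old pattern hardcoded the "### Instruction" / "### Response" literals, while the new one is built from the configured markers. Below is a minimal sketch of the new pattern in isolation; the values for self.instruction and self.end are assumptions (the diff only shows self.input and self.response), not taken from this PR.

import re

# Assumed marker values; only self.input and self.response are visible in
# this diff, so instruction and end here are hypothetical stand-ins.
instruction = "### Instruction:\n"
response = "### Response:\n"
end = "</s>"

text = (
    "### Instruction:\nSummarize the article.</s>"
    "### Response:\nHere is a short summary.</s>"
    "### Instruction:\nNow translate the summary.</s>"
)

# New pattern: derived from the markers rather than hardcoded literals, so it
# keeps working if the prompt template strings change.
pattern = r"{0}.*?{2}|{1}.*?{2}".format(instruction, response, end)
convs = re.findall(pattern, text, re.DOTALL)

for conv in convs:
    print(repr(conv))  # each instruction/response turn, ending with the end marker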
llm_on_ray/finetune/finetune.py (10 changes: 5 additions & 5 deletions)
@@ -40,7 +40,7 @@
 from pydantic_yaml import parse_yaml_raw_as

 from llm_on_ray import common
-from llm_on_ray.finetune.data_preprocess import DataPreprocess
+from llm_on_ray.finetune.data_process import DataProcessor
 from llm_on_ray.finetune.finetune_config import FinetuneConfig


@@ -207,16 +207,16 @@ def tokenize_dataset(config: Dict, tokenizer, dataset):
     block_size = config["Dataset"].get("block_size", 512)
     tokenizer.pad_token = tokenizer.eos_token

-    preprocess = DataPreprocess(config, tokenizer.eos_token)
+    processor = DataProcessor(config, tokenizer.eos_token)

     for key in dataset:
-        prompts = preprocess.prompt(dataset[key])
+        prompts = processor.make_prompt(dataset[key])
         dataset[key] = datasets.Dataset.from_dict(prompts)

     column_names = list(dataset["train"].features)
-    preprocess_fn = preprocess.tokenize(tokenizer)
+    processor_fn = processor.tokenize(tokenizer)
     tokenized_dataset = dataset.map(
-        preprocess_fn,
+        processor_fn,
         remove_columns=column_names,
         batched=True,
         load_from_cache_file=False,
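For reference, the renamed API end to end: a minimal usage sketch of the tokenize_dataset flow above, assuming a tokenizer with an eos_token and a toy instruction/response dataset. The config contents and column names here are hypothetical, not dictated by this PR.

import datasets
from transformers import AutoTokenizer

from llm_on_ray.finetune.data_process import DataProcessor

# Hypothetical minimal config; real runs parse a full FinetuneConfig from YAML.
config = {"Dataset": {"block_size": 512}}

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

processor = DataProcessor(config, tokenizer.eos_token)

# Toy dataset; the actual column names depend on the dataset format in use.
dataset = datasets.DatasetDict(
    {
        "train": datasets.Dataset.from_dict(
            {"instruction": ["Say hello."], "response": ["Hello!"]}
        )
    }
)

# Same flow as tokenize_dataset above: build prompts, then tokenize in batches.
for key in dataset:
    prompts = processor.make_prompt(dataset[key])
    dataset[key] = datasets.Dataset.from_dict(prompts)

column_names = list(dataset["train"].features)
tokenized_dataset = dataset.map(
    processor.tokenize(tokenizer),
    remove_columns=column_names,
    batched=True,
    load_from_cache_file=False,
)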