From afeff61d5844af3873c89c45fe28e807fde3cffa Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Wed, 12 Jun 2024 16:28:02 +0000 Subject: [PATCH] try to fix evaluation problem --- llm_on_ray/finetune/finetune.py | 5 ++++- llm_on_ray/finetune/template.py | 12 ++---------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index eb4996cb5..28066c5ff 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -234,7 +234,10 @@ def prompt(rec): column_names += [template.TEXT_COLUMN_NAME] def tokenize_function(examples): - return tokenizer(examples[template.TEXT_COLUMN_NAME], max_length=max_length) + results = tokenizer(examples[template.TEXT_COLUMN_NAME], max_length=max_length) + results["input_ids"].append(tokenizer.eos_token_id) + results["attention_mask"].append(1) + return results tokenized_dataset = dataset.map( tokenize_function, diff --git a/llm_on_ray/finetune/template.py b/llm_on_ray/finetune/template.py index cf8647d7f..10ac03bf8 100644 --- a/llm_on_ray/finetune/template.py +++ b/llm_on_ray/finetune/template.py @@ -28,16 +28,12 @@ {instruction_key} {instruction} -{response_key} -{response} - -{end_key}""".format( +{response_key}{response}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY, response="{response}", - end_key=END_KEY, ) PROMPT_WITH_INPUT_FORMAT = """{intro} @@ -48,10 +44,7 @@ {input_key} {input} -{response_key} -{response} - -{end_key}""".format( +{response_key}{response}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", @@ -59,6 +52,5 @@ input="{input}", response_key=RESPONSE_KEY, response="{response}", - end_key=END_KEY, ) TEXT_COLUMN_NAME = "text"