From afeff61d5844af3873c89c45fe28e807fde3cffa Mon Sep 17 00:00:00 2001
From: "Wu, Gangsheng" <gangsheng.wu@intel.com>
Date: Wed, 12 Jun 2024 16:28:02 +0000
Subject: [PATCH] Fix evaluation: append EOS token id after tokenizing, drop END_KEY from prompt templates

---
 llm_on_ray/finetune/finetune.py |  9 ++++++++-
 llm_on_ray/finetune/template.py | 12 ++----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py
index eb4996cb5..28066c5ff 100644
--- a/llm_on_ray/finetune/finetune.py
+++ b/llm_on_ray/finetune/finetune.py
@@ -234,7 +234,14 @@ def prompt(rec):
         column_names += [template.TEXT_COLUMN_NAME]
 
     def tokenize_function(examples):
-        return tokenizer(examples[template.TEXT_COLUMN_NAME], max_length=max_length)
+        # Tokenize one record's prompt text and terminate it with EOS.
+        # NOTE(review): the in-place append assumes dataset.map() runs with
+        # batched=False (flat id lists per record) -- confirm against the call.
+        results = tokenizer(examples[template.TEXT_COLUMN_NAME], max_length=max_length)
+        if tokenizer.eos_token_id is not None:
+            results["input_ids"].append(tokenizer.eos_token_id)
+            results["attention_mask"].append(1)
+        return results
 
     tokenized_dataset = dataset.map(
         tokenize_function,
diff --git a/llm_on_ray/finetune/template.py b/llm_on_ray/finetune/template.py
index cf8647d7f..10ac03bf8 100644
--- a/llm_on_ray/finetune/template.py
+++ b/llm_on_ray/finetune/template.py
@@ -28,16 +28,12 @@
 {instruction_key}
 {instruction}
 
-{response_key}
-{response}
-
-{end_key}""".format(
+{response_key}{response}""".format(
     intro=INTRO_BLURB,
     instruction_key=INSTRUCTION_KEY,
     instruction="{instruction}",
     response_key=RESPONSE_KEY,
     response="{response}",
-    end_key=END_KEY,
 )
 
 PROMPT_WITH_INPUT_FORMAT = """{intro}
@@ -48,10 +44,7 @@
 {input_key}
 {input}
 
-{response_key}
-{response}
-
-{end_key}""".format(
+{response_key}{response}""".format(
     intro=INTRO_BLURB,
     instruction_key=INSTRUCTION_KEY,
     instruction="{instruction}",
@@ -59,6 +52,5 @@
     input="{input}",
     response_key=RESPONSE_KEY,
     response="{response}",
-    end_key=END_KEY,
 )
 TEXT_COLUMN_NAME = "text"