From 918290775735c21b1c35398e141becbffa4f247f Mon Sep 17 00:00:00 2001
From: minmingzhu <45281494+minmingzhu@users.noreply.github.com>
Date: Wed, 10 Apr 2024 11:22:03 +0000
Subject: [PATCH] [Finetune] use base model mpt-7b instead of mpt-7b-chat
 (#181)

* use base model mpt-7b instead of mpt-7b-chat

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* manually specify the tokenizer

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* update

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* update docs/finetune_parameters.md

Signed-off-by: minmingzhu <minming.zhu@intel.com>

---------

Signed-off-by: minmingzhu <minming.zhu@intel.com>
---
 .github/workflows/night_build_memo.txt                      | 2 +-
 .github/workflows/workflow_finetune.yml                     | 6 +++---
 docs/finetune_parameters.md                                 | 1 +
 llm_on_ray/finetune/finetune.py                             | 6 +++++-
 llm_on_ray/finetune/finetune_config.py                      | 1 +
 .../finetune/models/{mpt-7b-chat.yaml => mpt-7b.yaml}       | 3 ++-
 6 files changed, 13 insertions(+), 6 deletions(-)
 rename llm_on_ray/finetune/models/{mpt-7b-chat.yaml => mpt-7b.yaml} (91%)

diff --git a/.github/workflows/night_build_memo.txt b/.github/workflows/night_build_memo.txt
index e5197571c..520e176e1 100644
--- a/.github/workflows/night_build_memo.txt
+++ b/.github/workflows/night_build_memo.txt
@@ -1 +1 @@
-finetune: gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b
\ No newline at end of file
+finetune: gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, huggyllama/llama-7b
\ No newline at end of file
diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml
index 76f1097a4..ddc547774 100644
--- a/.github/workflows/workflow_finetune.yml
+++ b/.github/workflows/workflow_finetune.yml
@@ -34,7 +34,7 @@ jobs:
     name: finetune
     strategy:
       matrix:
-        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
+        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
@@ -92,7 +92,7 @@ jobs:
           with open(conf_path, encoding="utf-8") as reader:
               result = yaml.load(reader, Loader=yaml.FullLoader)
               result['General']['base_model'] = "${{ matrix.model }}"
-              if "${{ matrix.model }}" == "mosaicml/mpt-7b-chat":
+              if "${{ matrix.model }}" == "mosaicml/mpt-7b":
                   result['General']['config']['trust_remote_code'] = True
               else:
                   result['General']['config']['trust_remote_code'] = False
@@ -147,7 +147,7 @@ jobs:
 
       - name: Run Deltatuner Test on DENAS-LoRA Model
         run: |
-          if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf|mistralai\/Mistral-7B-v0.1|google\/gemma-2b)$ ]]; then
+          if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf|mistralai\/Mistral-7B-v0.1|google\/gemma-2b)$ ]]; then
             echo ${{ matrix.model }} is not supported!
           else
             docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md
index 531549adf..5d24f42e6 100644
--- a/docs/finetune_parameters.md
+++ b/docs/finetune_parameters.md
@@ -7,6 +7,7 @@ The following are the parameters supported in the finetuning workflow.
 |Configuration Name| Default|Meaning|
 |-|-|-|
 |base_model| EleutherAI/gpt-j-6b|Path to pretrained model or model identifier from huggingface.co/models|
+|tokenizer_name|None|Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models. If not provided, the tokenizer will be loaded from the `base_model`.|
 |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.|
 |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
 |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py
index b31a5f01d..0f9e96f96 100644
--- a/llm_on_ray/finetune/finetune.py
+++ b/llm_on_ray/finetune/finetune.py
@@ -155,6 +155,10 @@ def train_func(config: Dict[str, Any]):
 
     gradient_accumulation_steps = config["Training"].get("gradient_accumulation_steps", 1)
     base_model = config["General"]["base_model"]
+    if config["General"].get("tokenizer_name") is not None:
+        tokenizer_name = config["General"].get("tokenizer_name")
+    else:
+        tokenizer_name = base_model
     dataset_file = config["Dataset"]["train_file"]
 
     seed = config["Training"].get("seed")
@@ -171,7 +175,7 @@ def train_func(config: Dict[str, Any]):
 
     tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()(
         config={
-            "name": base_model,
+            "name": tokenizer_name,
             "config": config["General"]["config"],
         }
     )
diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py
index 8f5f6ed6f..a01095c16 100644
--- a/llm_on_ray/finetune/finetune_config.py
+++ b/llm_on_ray/finetune/finetune_config.py
@@ -52,6 +52,7 @@ class DeltatunerConfig(BaseModel):
 
 class General(BaseModel):
     base_model: str
+    tokenizer_name: Optional[str] = None
     gpt_base_model: bool
     output_dir: str
     checkpoint_dir: Optional[str]
diff --git a/llm_on_ray/finetune/models/mpt-7b-chat.yaml b/llm_on_ray/finetune/models/mpt-7b.yaml
similarity index 91%
rename from llm_on_ray/finetune/models/mpt-7b-chat.yaml
rename to llm_on_ray/finetune/models/mpt-7b.yaml
index b4644194f..067a093a2 100644
--- a/llm_on_ray/finetune/models/mpt-7b-chat.yaml
+++ b/llm_on_ray/finetune/models/mpt-7b.yaml
@@ -1,5 +1,6 @@
 General:
-  base_model: mosaicml/mpt-7b-chat
+  base_model: mosaicml/mpt-7b
+  tokenizer_name: EleutherAI/gpt-neox-20b
   gpt_base_model: false
   output_dir: /tmp/llm-ray/output
   checkpoint_dir: /tmp/llm-ray/checkpoint