[Finetune] Fix evaluation #252

Merged
merged 15 commits on Jun 28, 2024
fix comments
Signed-off-by: minmingzhu <minming.zhu@intel.com>
minmingzhu committed Jun 26, 2024
commit a633a134b9192ee4387f9b39b5f29e9a93d753f8
llm_on_ray/finetune/data_process.py
@@ -23,7 +23,7 @@
 IGNORE_INDEX = -100


-class DataPreprocess:
+class DataProcessor:
     # We used the following prompts for fine-tuning the Alpaca model. See the reference doc at https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release
     def __init__(self, config, eos_token):
         self.config = config
@@ -33,7 +33,7 @@ def __init__(self, config, eos_token):
         self.input = "### Input:\n"
         self.response = "### Response:\n"

-    def prompt(self, examples):
+    def make_prompt(self, examples):
         prompts = {}
         prompts["prompt_sources"] = []
         prompts["prompt_targets"] = []
@@ -110,18 +110,14 @@ def preprocess_function_with_neural_chat(examples):
             if len(keys) != 2:
                 raise ValueError("Unsupported dataset format")
             assistant_tokens = tokenizer.tokenize(self.response)
-            header = (
-                "Below is an instruction that describes a task. Write a response that appropriately completes the request."
-                + self.end
-                + "\n"
-            )
+            header = self.intro + self.end + "\n"

             examples["input_ids"] = []
             examples["labels"] = []
             examples["attention_mask"] = []
             for instruction, response in zip(examples[keys[0]], examples[keys[1]]):
                 convs = re.findall(
-                    r"### Instruction.*?{0}|### Response.*?{0}".format(self.end),
+                    r"{0}.*?{2}|{1}.*?{2}".format(self.instruction, self.response, self.end),
                     instruction,
                     re.DOTALL,
                 )
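To illustrate the regex change: the old pattern hardcoded the "### Instruction" / "### Response" literals, while the new one is built from the configured markers. Below is a minimal sketch of the new pattern in isolation; the values for self.instruction and self.end are assumptions (the diff only shows self.input and self.response), not taken from this PR.

import re

# Assumed marker values; only self.input and self.response are visible in
# this diff, so instruction and end here are hypothetical stand-ins.
instruction = "### Instruction:\n"
response = "### Response:\n"
end = "</s>"

text = (
    "### Instruction:\nSummarize the article.</s>"
    "### Response:\nHere is a short summary.</s>"
    "### Instruction:\nNow translate the summary.</s>"
)

# New pattern: derived from the markers rather than hardcoded literals, so it
# keeps working if the prompt template strings change.
pattern = r"{0}.*?{2}|{1}.*?{2}".format(instruction, response, end)
convs = re.findall(pattern, text, re.DOTALL)

for conv in convs:
    print(repr(conv))  # each instruction/response turn, ending with the end marker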
llm_on_ray/finetune/finetune.py (10 changes: 5 additions & 5 deletions)
@@ -40,7 +40,7 @@
 from pydantic_yaml import parse_yaml_raw_as

 from llm_on_ray import common
-from llm_on_ray.finetune.data_preprocess import DataPreprocess
+from llm_on_ray.finetune.data_process import DataProcessor
 from llm_on_ray.finetune.finetune_config import FinetuneConfig


@@ -207,16 +207,16 @@ def tokenize_dataset(config: Dict, tokenizer, dataset):
     block_size = config["Dataset"].get("block_size", 512)
     tokenizer.pad_token = tokenizer.eos_token

-    preprocess = DataPreprocess(config, tokenizer.eos_token)
+    processor = DataProcessor(config, tokenizer.eos_token)

     for key in dataset:
-        prompts = preprocess.prompt(dataset[key])
+        prompts = processor.make_prompt(dataset[key])
         dataset[key] = datasets.Dataset.from_dict(prompts)

     column_names = list(dataset["train"].features)
-    preprocess_fn = preprocess.tokenize(tokenizer)
+    processor_fn = processor.tokenize(tokenizer)
     tokenized_dataset = dataset.map(
-        preprocess_fn,
+        processor_fn,
         remove_columns=column_names,
         batched=True,
         load_from_cache_file=False,
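For reference, the renamed API end to end: a minimal usage sketch of the tokenize_dataset flow above, assuming a tokenizer with an eos_token and a toy instruction/response dataset. The config contents and column names here are hypothetical, not dictated by this PR.

import datasets
from transformers import AutoTokenizer

from llm_on_ray.finetune.data_process import DataProcessor

# Hypothetical minimal config; real runs parse a full FinetuneConfig from YAML.
config = {"Dataset": {"block_size": 512}}

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

processor = DataProcessor(config, tokenizer.eos_token)

# Toy dataset; the actual column names depend on the dataset format in use.
dataset = datasets.DatasetDict(
    {
        "train": datasets.Dataset.from_dict(
            {"instruction": ["Say hello."], "response": ["Hello!"]}
        )
    }
)

# Same flow as tokenize_dataset above: build prompts, then tokenize in batches.
for key in dataset:
    prompts = processor.make_prompt(dataset[key])
    dataset[key] = datasets.Dataset.from_dict(prompts)

column_names = list(dataset["train"].features)
tokenized_dataset = dataset.map(
    processor.tokenize(tokenizer),
    remove_columns=column_names,
    batched=True,
    load_from_cache_file=False,
)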