From 55fd871cef7399ec24ac58971564ebef90798263 Mon Sep 17 00:00:00 2001
From: "yusuf.cakmak"
Date: Wed, 22 Nov 2023 13:58:37 +0300
Subject: [PATCH] Added control to tokenizer for pad_token

---
 scripts/training/run_clm_sft_with_peft.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py
index 052744d..7e5a169 100644
--- a/scripts/training/run_clm_sft_with_peft.py
+++ b/scripts/training/run_clm_sft_with_peft.py
@@ -53,6 +53,7 @@ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+DEFAULT_PAD_TOKEN = "<pad>"
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 
@@ -340,6 +341,10 @@ def main():
     if (len(tokenizer)) != 55296:
         raise ValueError(f"The vocab size of the tokenizer should be 55296, but found {len(tokenizer)}.\n"
                          "Please use Chinese-LLaMA-2 tokenizer.")
+
+    if tokenizer.pad_token is None:
+        print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
+        tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
     data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
     eval_dataset=None
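
Note: for reference, below is a minimal standalone sketch of the same pad-token guard, runnable outside run_clm_sft_with_peft.py. It uses gpt2 purely as a stand-in checkpoint (it also ships without a pad token); the Chinese-LLaMA-2 model and the script's own loading logic are not reproduced here. The embumbing resize at the end is an assumption about what a full fine-tuning setup may need, not something this patch does.

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Same default as the patch; the exact token string is this patch's choice.
    DEFAULT_PAD_TOKEN = "<pad>"

    # gpt2 is a stand-in checkpoint for illustration only.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Tokenizers that ship without a pad token break batched padding in
    # collators; register one only if it is missing, as the patch does.
    if tokenizer.pad_token is None:
        print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
        num_added = tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
        # If a genuinely new token was registered, grow the embedding matrix
        # to the new vocab size so the token id does not index out of bounds.
        # The patch itself does not do this; whether it is needed depends on
        # how the training script loads and resizes the model elsewhere.
        if num_added > 0:
            model.resize_token_embeddings(len(tokenizer))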