From 55fd871cef7399ec24ac58971564ebef90798263 Mon Sep 17 00:00:00 2001
From: "yusuf.cakmak"
Date: Wed, 22 Nov 2023 13:58:37 +0300
Subject: [PATCH] Added control to tokenizer for pad_token

---
 scripts/training/run_clm_sft_with_peft.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py
index 052744d..7e5a169 100644
--- a/scripts/training/run_clm_sft_with_peft.py
+++ b/scripts/training/run_clm_sft_with_peft.py
@@ -53,6 +53,7 @@ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+DEFAULT_PAD_TOKEN = "<pad>"
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 
@@ -340,6 +341,10 @@ def main():
     if (len(tokenizer)) != 55296:
         raise ValueError(f"The vocab size of the tokenizer should be 55296, but found {len(tokenizer)}.\n"
                          "Please use Chinese-LLaMA-2 tokenizer.")
+
+    if tokenizer.pad_token is None:
+        print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
+        tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
     data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
     eval_dataset=None
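
Note: for reference, below is a minimal standalone sketch of the same pad-token guard, runnable outside run_clm_sft_with_peft.py. It uses gpt2 purely as a stand-in checkpoint (it also ships without a pad token); the Chinese-LLaMA-2 model and the script's own loading logic are not reproduced here. The embumbing resize at the end is an assumption about what a full fine-tuning setup may need, not something this patch does.

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Same default as the patch; the exact token string is this patch's choice.
    DEFAULT_PAD_TOKEN = "<pad>"

    # gpt2 is a stand-in checkpoint for illustration only.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Tokenizers that ship without a pad token break batched padding in
    # collators; register one only if it is missing, as the patch does.
    if tokenizer.pad_token is None:
        print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
        num_added = tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
        # If a genuinely new token was registered, grow the embedding matrix
        # to the new vocab size so the token id does not index out of bounds.
        # The patch itself does not do this; whether it is needed depends on
        # how the training script loads and resizes the model elsewhere.
        if num_added > 0:
            model.resize_token_embeddings(len(tokenizer))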