fix: Do not add special tokens to vocab, as then they won't count as special tokens
saattrupdan committed Dec 7, 2023
1 parent 4c0d09a commit 5d15643
Showing 1 changed file with 1 addition and 3 deletions.
4 changes: 1 addition & 3 deletions src/coral_models/wav2vec2.py
@@ -128,8 +128,8 @@ def load_processor(self) -> Wav2Vec2Processor:
         dump_vocabulary(self.cfg)
         tokenizer: Wav2Vec2CTCTokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
             self.cfg.model_dir,
-            unk_token="<unk>",
             pad_token="<pad>",
+            unk_token="<unk>",
             bos_token="<s>",
             eos_token="</s>",
             word_delimiter_token="|",
@@ -310,8 +310,6 @@ def dump_vocabulary(cfg: DictConfig) -> None:

     # Build vocabulary
     vocab = {char: idx for idx, char in enumerate(unique_characters)}
-    for tok in ["<unk>", "<pad>", "<s>", "</s>"]:
-        vocab[tok] = len(vocab)

     # Dump the vocabulary to a json file
     model_dir = Path(cfg.model_dir)
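As context for the change above: after this fix, dump_vocabulary writes only the ordinary characters to vocab.json, and the special tokens are supplied to Wav2Vec2CTCTokenizer.from_pretrained so that the tokenizer registers them as special tokens. The following is a minimal, self-contained sketch of that pattern; it is not part of the commit, the example characters and the temporary directory are made up for illustration, and it assumes a recent transformers version in which special tokens missing from vocab.json are appended automatically as added tokens.

import json
import tempfile
from pathlib import Path

from transformers import Wav2Vec2CTCTokenizer

with tempfile.TemporaryDirectory() as model_dir:
    # After the fix, vocab.json contains only the ordinary characters;
    # the special tokens are deliberately left out (illustrative characters).
    unique_characters = ["|", "a", "b", "c"]
    vocab = {char: idx for idx, char in enumerate(unique_characters)}
    with (Path(model_dir) / "vocab.json").open("w") as f:
        json.dump(vocab, f)

    # The special tokens are passed to the tokenizer instead, so they are
    # registered as special tokens (and, in recent transformers versions,
    # appended to the vocabulary as added tokens).
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
        model_dir,
        pad_token="<pad>",
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        word_delimiter_token="|",
    )

    print(tokenizer.all_special_tokens)  # contains <s>, </s>, <unk> and <pad>
    print(tokenizer.get_vocab())         # the four characters plus the special tokens

Had the special tokens instead been written into vocab.json as ordinary entries, as the removed for-loop did, they would be loaded as plain vocabulary items rather than special tokens, which is the behaviour the commit message warns about.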
