diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 2944d853..f49ba47b 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -128,8 +128,8 @@ def load_processor(self) -> Wav2Vec2Processor: dump_vocabulary(self.cfg) tokenizer: Wav2Vec2CTCTokenizer = Wav2Vec2CTCTokenizer.from_pretrained( self.cfg.model_dir, - unk_token="", pad_token="", + unk_token="", bos_token="", eos_token="", word_delimiter_token="|", @@ -310,8 +310,6 @@ def dump_vocabulary(cfg: DictConfig) -> None: # Build vocabulary vocab = {char: idx for idx, char in enumerate(unique_characters)} - for tok in ["", "", "", ""]: - vocab[tok] = len(vocab) # Dump the vocabulary to a json file model_dir = Path(cfg.model_dir)