fix: Do not add special tokens to vocab, as then they won't count as special tokens
saattrupdan committed Dec 7, 2023
1 parent 4c0d09a commit 5d15643
Showing 1 changed file with 1 addition and 3 deletions.
4 changes: 1 addition & 3 deletions src/coral_models/wav2vec2.py
@@ -128,8 +128,8 @@ def load_processor(self) -> Wav2Vec2Processor:
         dump_vocabulary(self.cfg)
         tokenizer: Wav2Vec2CTCTokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
             self.cfg.model_dir,
-            unk_token="<unk>",
             pad_token="<pad>",
+            unk_token="<unk>",
             bos_token="<s>",
             eos_token="</s>",
             word_delimiter_token="|",
@@ -310,8 +310,6 @@ def dump_vocabulary(cfg: DictConfig) -> None:

     # Build vocabulary
     vocab = {char: idx for idx, char in enumerate(unique_characters)}
-    for tok in ["<unk>", "<pad>", "<s>", "</s>"]:
-        vocab[tok] = len(vocab)

     # Dump the vocabulary to a json file
     model_dir = Path(cfg.model_dir)
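As context for the change above: after this fix, dump_vocabulary writes only the ordinary characters to vocab.json, and the special tokens are supplied to Wav2Vec2CTCTokenizer.from_pretrained so that the tokenizer registers them as special tokens. The following is a minimal, self-contained sketch of that pattern; it is not part of the commit, the example characters and the temporary directory are made up for illustration, and it assumes a recent transformers version in which special tokens missing from vocab.json are appended automatically as added tokens.

import json
import tempfile
from pathlib import Path

from transformers import Wav2Vec2CTCTokenizer

with tempfile.TemporaryDirectory() as model_dir:
    # After the fix, vocab.json contains only the ordinary characters;
    # the special tokens are deliberately left out (illustrative characters).
    unique_characters = ["|", "a", "b", "c"]
    vocab = {char: idx for idx, char in enumerate(unique_characters)}
    with (Path(model_dir) / "vocab.json").open("w") as f:
        json.dump(vocab, f)

    # The special tokens are passed to the tokenizer instead, so they are
    # registered as special tokens (and, in recent transformers versions,
    # appended to the vocabulary as added tokens).
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
        model_dir,
        pad_token="<pad>",
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        word_delimiter_token="|",
    )

    print(tokenizer.all_special_tokens)  # contains <s>, </s>, <unk> and <pad>
    print(tokenizer.get_vocab())         # the four characters plus the special tokens

Had the special tokens instead been written into vocab.json as ordinary entries, as the removed for-loop did, they would be loaded as plain vocabulary items rather than special tokens, which is the behaviour the commit message warns about.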
