Skip to content

Commit

Permalink
chore: Deal with word delimiters
Browse files (browse the repository at this point in the history)
  • Loading branch information
saattrupdan committed Dec 5, 2023
1 parent e6f3f43 commit 5e20bd4
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 4 deletions.
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dirs:
seed: 4242

# Dataset parameters
- characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü '
+ characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü'
max_seconds_per_example: 10
dataloader_num_workers: 8

Expand Down
2 changes: 1 addition & 1 deletion src/coral_models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def clean_dataset(
# transcriptions, as they do not have an influence on the pronunciation of the
# words.
non_standard_characters_regex = re.compile(
- f"[^{re.escape(cfg.characters_to_keep)}]"
+ f"[^{re.escape(cfg.characters_to_keep + ' ')}]"
)

mapped = dataset.map(
Expand Down
6 changes: 4 additions & 2 deletions src/coral_models/wav2vec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ def load_processor(self) -> Wav2Vec2Processor:
pad_token="<pad>",
bos_token="<s>",
eos_token="</s>",
- word_delimiter_token=" ",
+ word_delimiter_token="|",
+ replace_word_delimiter_char=" ",
)
break
except json.decoder.JSONDecodeError:
Expand All @@ -156,6 +157,7 @@ def load_processor(self) -> Wav2Vec2Processor:
self.processor = Wav2Vec2Processor(
feature_extractor=extractor, tokenizer=tokenizer
)

return self.processor

def load_model(self) -> Wav2Vec2ForCTC:
Expand All @@ -180,7 +182,7 @@ def load_model(self) -> Wav2Vec2ForCTC:
vocab_size=len(self.processor.tokenizer.get_vocab()),
ctc_zero_infinity=True,
)
assert isinstance(model, Wav2Vec2ForCTC)
assert isinstance(model, Wav2Vec2ForCTC)

if self.cfg.model.freeze_feature_encoder:
for param in model.wav2vec2.parameters():
Expand Down

0 comments on commit 5e20bd4

Please sign in to comment.