From afdc325e9203afb35ec2ecbd4a393475bd927f50 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 09:51:21 +0100 Subject: [PATCH 01/20] chore: Update configs --- config/config.yaml | 6 +++--- config/model/wav2vec2.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index ec8017b8..24401896 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -51,9 +51,9 @@ learning_rate: 3e-5 adam_first_momentum: 0.9 adam_second_momentum: 0.98 total_batch_size: 256 -per_device_batch_size: 16 -max_steps: 50_000 -warmup_steps: 1_000 +per_device_batch_size: 64 +max_steps: 120_000 +warmup_steps: 10_000 logging_steps: 10 eval_steps: 100 save_steps: 100 diff --git a/config/model/wav2vec2.yaml b/config/model/wav2vec2.yaml index 768954ca..45b5b79e 100644 --- a/config/model/wav2vec2.yaml +++ b/config/model/wav2vec2.yaml @@ -14,9 +14,9 @@ hidden_dropout: 0.0 feat_proj_dropout: 0.0 feat_quantizer_dropout: 0.0 final_dropout: 0.0 -mask_time_prob: 0.5 +mask_time_prob: 0.3 mask_time_length: 10 -mask_feature_prob: 0.5 +mask_feature_prob: 0.3 mask_feature_length: 64 layerdrop: 0.1 # This will automatically be set to 0 in a multi-gpu setting ctc_loss_reduction: mean From 77bc20fd1459ab7c41a6fd9e04910536f11fd5d4 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 09:51:33 +0100 Subject: [PATCH 02/20] style: Logging, kwargs --- src/coral_models/compute_metrics.py | 2 +- src/coral_models/wav2vec2.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index 745dfafc..65e51fa5 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -42,7 +42,7 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, mismatch_dim = len(vocab_size) - predictions.shape[-1] predictions = np.pad(predictions, ((0, 0), (0, 0), (0, mismatch_dim))) predictions_str = tokenizer.batch_decode( - predictions, skip_special_tokens=True + sequences=predictions, skip_special_tokens=True ) # Otherwise, if we are not using a language model, we need to convert the diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 1092f3c6..1dee9980 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -133,11 +133,12 @@ def load_processor(self) -> Wav2Vec2Processor: ) break except json.decoder.JSONDecodeError: - process_id = os.getenv("RANK", 0) - logger.warning( - f"JSONDecodeError while loading tokenizer on process {process_id}. " - "Retrying in a second." - ) + log_message = "JSONDecodeError while loading tokenizer" + process_id = os.getenv("RANK") + if process_id is not None: + log_message += f" in process {process_id}" + log_message += ". Retrying in a second." 
+ logger.warning(log_message) time.sleep(1) # Set the `model_max_length` attribute of the tokenizer, if it hasn't been set, From 8a15b809c0c94a13f6371dab729edc575ab35be2 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 09:51:44 +0100 Subject: [PATCH 03/20] debug: Breakpoint --- src/coral_models/finetune.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coral_models/finetune.py b/src/coral_models/finetune.py index 723271bf..d9958e60 100644 --- a/src/coral_models/finetune.py +++ b/src/coral_models/finetune.py @@ -70,6 +70,7 @@ def finetune(cfg: DictConfig) -> None: model_setup: ModelSetup = load_model_setup(cfg) processor = model_setup.load_processor() + breakpoint() processor.save_pretrained(cfg.model_dir) model = model_setup.load_model() dataset = load_data(cfg) From 26ccd56d28714f08e50305684a96c2681faed35a Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 10:15:24 +0100 Subject: [PATCH 04/20] fix: Do not remove special tokens when decoding, as it prevents duplicate characters --- src/coral_models/compute_metrics.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index 65e51fa5..b6789694 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -41,18 +41,16 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, vocab_size = tokenizer.get_vocab() mismatch_dim = len(vocab_size) - predictions.shape[-1] predictions = np.pad(predictions, ((0, 0), (0, 0), (0, mismatch_dim))) - predictions_str = tokenizer.batch_decode( - sequences=predictions, skip_special_tokens=True - ) + predictions_str = tokenizer.batch_decode(sequences=predictions) # Otherwise, if we are not using a language model, we need to convert the # logits to token IDs and then decode the token IDs to get the predicted string else: pred_ids: NDArray[np.int_] = np.argmax(predictions, axis=-1) - predictions_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) + predictions_str = tokenizer.batch_decode(pred_ids) elif len(predictions.shape) == 2 and predictions.dtype == np.int_: - predictions_str = tokenizer.batch_decode(predictions, skip_special_tokens=True) + predictions_str = tokenizer.batch_decode(predictions) else: raise ValueError( @@ -67,9 +65,7 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, labels[labels == -100] = pad_token # Decode the ground truth labels - labels_str = tokenizer.batch_decode( - sequences=labels, skip_special_tokens=True, group_tokens=False - ) + labels_str = tokenizer.batch_decode(sequences=labels, group_tokens=False) # TEMP: Log both the predictions and the ground truth labels is_main_process = os.getenv("RANK", "0") == "0" From 309a60d7b0eeb9b4ff62ea0fe1f277123a51f6b9 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 10:15:59 +0100 Subject: [PATCH 05/20] chore: Remove breakpoint --- src/coral_models/finetune.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coral_models/finetune.py b/src/coral_models/finetune.py index d9958e60..723271bf 100644 --- a/src/coral_models/finetune.py +++ b/src/coral_models/finetune.py @@ -70,7 +70,6 @@ def finetune(cfg: DictConfig) -> None: model_setup: ModelSetup = load_model_setup(cfg) processor = model_setup.load_processor() - breakpoint() processor.save_pretrained(cfg.model_dir) model = model_setup.load_model() dataset = load_data(cfg) From 
bc07ea0d4f1a7d3425dd66a7df39dcbfec2dae51 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 10:30:21 +0100 Subject: [PATCH 06/20] docs: Add note --- config/model/wav2vec2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/model/wav2vec2.yaml b/config/model/wav2vec2.yaml index 45b5b79e..5b85bbed 100644 --- a/config/model/wav2vec2.yaml +++ b/config/model/wav2vec2.yaml @@ -18,7 +18,7 @@ mask_time_prob: 0.3 mask_time_length: 10 mask_feature_prob: 0.3 mask_feature_length: 64 -layerdrop: 0.1 # This will automatically be set to 0 in a multi-gpu setting +layerdrop: 0.1 # NOTE: This will automatically be set to 0 in a multi-gpu setting ctc_loss_reduction: mean # Decoder hyperparameters From e6f3f437425d92063c6776c0f3624a1aca6c1156 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 16:05:30 +0100 Subject: [PATCH 07/20] docs: Always print sample predictions when computing metrics --- src/coral_models/compute_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index b6789694..71a594c9 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -67,7 +67,7 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, # Decode the ground truth labels labels_str = tokenizer.batch_decode(sequences=labels, group_tokens=False) - # TEMP: Log both the predictions and the ground truth labels + # Log both the predictions and the ground truth labels is_main_process = os.getenv("RANK", "0") == "0" if is_main_process: random_idx = np.random.randint(0, len(predictions_str)) From 5e20bd417a7937e8352c2cd2445f23cb58bdcfe7 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 16:05:55 +0100 Subject: [PATCH 08/20] chore: Deal with word delimiters --- config/config.yaml | 2 +- src/coral_models/data.py | 2 +- src/coral_models/wav2vec2.py | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 24401896..2b55cc00 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -15,7 +15,7 @@ dirs: seed: 4242 # Dataset parameters -characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü ' +characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü' max_seconds_per_example: 10 dataloader_num_workers: 8 diff --git a/src/coral_models/data.py b/src/coral_models/data.py index a3011ce7..37ac0e69 100644 --- a/src/coral_models/data.py +++ b/src/coral_models/data.py @@ -281,7 +281,7 @@ def clean_dataset( # transcriptions, as they do not have an influence on the pronunciation of the # words. 
non_standard_characters_regex = re.compile( - f"[^{re.escape(cfg.characters_to_keep)}]" + f"[^{re.escape(cfg.characters_to_keep + ' ')}]" ) mapped = dataset.map( diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 1dee9980..552769f5 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -129,7 +129,8 @@ def load_processor(self) -> Wav2Vec2Processor: pad_token="", bos_token="", eos_token="", - word_delimiter_token=" ", + word_delimiter_token="|", + replace_word_delimiter_char=" ", ) break except json.decoder.JSONDecodeError: @@ -156,6 +157,7 @@ def load_processor(self) -> Wav2Vec2Processor: self.processor = Wav2Vec2Processor( feature_extractor=extractor, tokenizer=tokenizer ) + return self.processor def load_model(self) -> Wav2Vec2ForCTC: @@ -180,7 +182,7 @@ def load_model(self) -> Wav2Vec2ForCTC: vocab_size=len(self.processor.tokenizer.get_vocab()), ctc_zero_infinity=True, ) - assert isinstance(model, Wav2Vec2ForCTC) + assert isinstance(model, Wav2Vec2ForCTC) if self.cfg.model.freeze_feature_encoder: for param in model.wav2vec2.parameters(): From 6478ee68a9818819da639e19811fa3e7950e98c6 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 16:47:13 +0100 Subject: [PATCH 09/20] chore: Update configs --- config/model/test_wav2vec2.yaml | 19 ++++++++++--------- config/model/wav2vec2.yaml | 6 +++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/config/model/test_wav2vec2.yaml b/config/model/test_wav2vec2.yaml index 44351548..9d869bc3 100644 --- a/config/model/test_wav2vec2.yaml +++ b/config/model/test_wav2vec2.yaml @@ -9,16 +9,17 @@ clean_dataset: true # Model hyperparameters sampling_rate: 16_000 activation_dropout: 0.1 -attention_dropout: 0.1 -hidden_dropout: 0.1 -feat_proj_dropout: 0.1 -final_dropout: 0.1 -mask_time_prob: 0.075 +attention_dropout: 0.0 +hidden_dropout: 0.0 +feat_proj_dropout: 0.0 +feat_quantizer_dropout: 0.0 +final_dropout: 0.0 +mask_time_prob: 0.5 mask_time_length: 10 -mask_feature_prob: 0.075 -mask_feature_length: 10 -layerdrop: 0.0 # NOTE: This parameter cannot be used in a multi-gpu setting! 
-ctc_loss_reduction: sum +mask_feature_prob: 0.5 +mask_feature_length: 64 +layerdrop: 0.1 # NOTE: This will automatically be set to 0 in a multi-gpu setting +ctc_loss_reduction: mean # Decoder hyperparameters language_model_decoder: null diff --git a/config/model/wav2vec2.yaml b/config/model/wav2vec2.yaml index 5b85bbed..affb3ccd 100644 --- a/config/model/wav2vec2.yaml +++ b/config/model/wav2vec2.yaml @@ -14,12 +14,12 @@ hidden_dropout: 0.0 feat_proj_dropout: 0.0 feat_quantizer_dropout: 0.0 final_dropout: 0.0 -mask_time_prob: 0.3 +mask_time_prob: 0.5 mask_time_length: 10 -mask_feature_prob: 0.3 +mask_feature_prob: 0.5 mask_feature_length: 64 layerdrop: 0.1 # NOTE: This will automatically be set to 0 in a multi-gpu setting -ctc_loss_reduction: mean +ctc_loss_reduction: sum # Decoder hyperparameters language_model_decoder: ngram From ed4d97c69495c8fc5c560c2fa3d4484b73c7ce3e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 17:42:31 +0100 Subject: [PATCH 10/20] fix: Do not hardcode max_seconds_per_example, and add | and space to characters_to_keep --- src/coral_models/data.py | 2 +- src/coral_models/wav2vec2.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/coral_models/data.py b/src/coral_models/data.py index 37ac0e69..be429e00 100644 --- a/src/coral_models/data.py +++ b/src/coral_models/data.py @@ -281,7 +281,7 @@ def clean_dataset( # transcriptions, as they do not have an influence on the pronunciation of the # words. non_standard_characters_regex = re.compile( - f"[^{re.escape(cfg.characters_to_keep + ' ')}]" + f"[^{re.escape(cfg.characters_to_keep + ' |')}]" ) mapped = dataset.map( diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 552769f5..2944d853 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -43,6 +43,8 @@ class DataCollatorCTCWithPadding(DataCollatorMixin): Args: processor (Wav2Vec2Processor) The processor used for proccessing the data. + max_seconds_per_example (float): + The maximum number of seconds per example. 
padding (bool, str or PaddingStrategy, optional): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -61,6 +63,7 @@ class DataCollatorCTCWithPadding(DataCollatorMixin): processor: Wav2Vec2Processor padding: bool | str + max_seconds_per_example: float return_tensors: str = "pt" def torch_call(self, features: list[dict]) -> BatchFeature: @@ -86,12 +89,12 @@ def torch_call(self, features: list[dict]) -> BatchFeature: audio_features, padding=self.padding, return_tensors=self.return_tensors, - max_length=16_000 * 10, + max_length=16_000 * self.max_seconds_per_example, ) label_features = [dict(input_ids=feature["labels"]) for feature in features] - labels_batch: BatchEncoding = self.processor.tokenizer.pad( - label_features, + labels_batch: BatchEncoding = self.processor.pad( + labels=label_features, padding=self.padding, return_tensors=self.return_tensors, max_length=512, @@ -192,7 +195,9 @@ def load_model(self) -> Wav2Vec2ForCTC: def load_data_collator(self) -> DataCollatorCTCWithPadding: return DataCollatorCTCWithPadding( - processor=self.processor, padding=self.cfg.padding + processor=self.processor, + max_seconds_per_example=self.cfg.max_seconds_per_example, + padding=self.cfg.padding, ) def load_trainer_class(self) -> Type[Trainer]: @@ -278,7 +283,9 @@ def load_saved(self) -> PreTrainedModelData: model = Wav2Vec2ForCTC.from_pretrained(self.cfg.hub_id, token=True) data_collator = DataCollatorCTCWithPadding( - processor=processor, padding=self.cfg.padding + processor=processor, + max_seconds_per_example=self.cfg.max_seconds_per_example, + padding=self.cfg.padding, ) compute_metrics = partial(compute_wer_metrics, processor=processor) return PreTrainedModelData( @@ -299,7 +306,7 @@ def dump_vocabulary(cfg: DictConfig) -> None: The Hydra configuration object. 
""" # Build the set of all unique characters in the dataset - unique_characters: set[str] = set(cfg.characters_to_keep) + unique_characters: set[str] = set(cfg.characters_to_keep + "|") # Build vocabulary vocab = {char: idx for idx, char in enumerate(unique_characters)} From 627039919e43d54c4b3caa103fb49877a9f48a7f Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:00:23 +0100 Subject: [PATCH 11/20] fix: Ensure that we pad with pad_token when using a LM decoder --- src/coral_models/compute_metrics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index 71a594c9..b3283aff 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -40,7 +40,12 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, if predictions.dtype == np.int_: vocab_size = tokenizer.get_vocab() mismatch_dim = len(vocab_size) - predictions.shape[-1] - predictions = np.pad(predictions, ((0, 0), (0, 0), (0, mismatch_dim))) + predictions = np.pad( + array=predictions, + pad_width=((0, 0), (0, 0), (0, mismatch_dim)), + mode="constant", + constant_values=pad_token, + ) predictions_str = tokenizer.batch_decode(sequences=predictions) # Otherwise, if we are not using a language model, we need to convert the From 4c0d09a637d31e76869f570becc4c1799e89198f Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:00:45 +0100 Subject: [PATCH 12/20] fix: Ensure that pad_token is chosen when all logits for a token are -100 --- src/coral_models/compute_metrics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index b3283aff..9e252ecf 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -51,11 +51,16 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, # Otherwise, if we are not using a language model, we need to convert the # logits to token IDs and then decode the token IDs to get the predicted string else: + # If all the logits are -100 for a token, then we set the logit for the + # padding token for that token to 0. 
This is to ensure that this token gets + # decoded to a padding token, and are therefore ignored + predictions[np.all(predictions == -100, axis=-1), pad_token] = 0 + pred_ids: NDArray[np.int_] = np.argmax(predictions, axis=-1) predictions_str = tokenizer.batch_decode(pred_ids) elif len(predictions.shape) == 2 and predictions.dtype == np.int_: - predictions_str = tokenizer.batch_decode(predictions) + predictions_str = tokenizer.batch_decode(sequences=predictions) else: raise ValueError( From 5d15643f8043264bae560b2eb03c2c4004027c66 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:01:38 +0100 Subject: [PATCH 13/20] fix: Do not add special tokens to vocab, as then they won't count as special tokens --- src/coral_models/wav2vec2.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 2944d853..f49ba47b 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -128,8 +128,8 @@ def load_processor(self) -> Wav2Vec2Processor: dump_vocabulary(self.cfg) tokenizer: Wav2Vec2CTCTokenizer = Wav2Vec2CTCTokenizer.from_pretrained( self.cfg.model_dir, - unk_token="", pad_token="", + unk_token="", bos_token="", eos_token="", word_delimiter_token="|", @@ -310,8 +310,6 @@ def dump_vocabulary(cfg: DictConfig) -> None: # Build vocabulary vocab = {char: idx for idx, char in enumerate(unique_characters)} - for tok in ["", "", "", ""]: - vocab[tok] = len(vocab) # Dump the vocabulary to a json file model_dir = Path(cfg.model_dir) From 4818ec82738f346c38f6fc885b3432c965f1a3ea Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:02:26 +0100 Subject: [PATCH 14/20] fix: Update padding kwargs in Whisper analogous to Wav2Vec2 --- src/coral_models/whisper.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index 3ade4940..6c459097 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -77,15 +77,23 @@ def torch_call(self, features: list[dict]) -> BatchFeature: raise ValueError( "Features must contain either 'input_features' or 'audio' key." ) - batch = self.processor.feature_extractor.pad( - audio_features, return_tensors="pt" + batch = self.processor.pad( + audio_features, + padding=self.padding, + return_tensors=self.return_tensors, + max_length=16_000 * self.max_seconds_per_example, ) # Get the tokenized label sequences label_features = [{"input_ids": feature["labels"]} for feature in features] # Pad the labels to max length - labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt") + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + return_tensors=self.return_tensors, + max_length=512, + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill( From 7a9d4bb068d1485cc9408383b9803d8e954fe5ca Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 17:15:02 +0100 Subject: [PATCH 15/20] fix: Padding with a WhisperProcessor --- src/coral_models/whisper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index 6c459097..f460d4bb 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -77,7 +77,7 @@ def torch_call(self, features: list[dict]) -> BatchFeature: raise ValueError( "Features must contain either 'input_features' or 'audio' key." 
) - batch = self.processor.pad( + batch = self.processor.feature_extractor.pad( audio_features, padding=self.padding, return_tensors=self.return_tensors, @@ -88,7 +88,7 @@ def torch_call(self, features: list[dict]) -> BatchFeature: label_features = [{"input_ids": feature["labels"]} for feature in features] # Pad the labels to max length - labels_batch = self.processor.pad( + labels_batch = self.processor.tokenizer.pad( labels=label_features, padding=self.padding, return_tensors=self.return_tensors, From 56d617c0b08450ea0c7ff1d697a6272f87b99b3e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 13 Dec 2023 12:36:09 +0100 Subject: [PATCH 16/20] fix: Add `max_seconds_per_example` to Whisper Processor --- src/coral_models/wav2vec2.py | 2 +- src/coral_models/whisper.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index f49ba47b..983269c1 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -62,8 +62,8 @@ class DataCollatorCTCWithPadding(DataCollatorMixin): """ processor: Wav2Vec2Processor - padding: bool | str max_seconds_per_example: float + padding: bool | str return_tensors: str = "pt" def torch_call(self, features: list[dict]) -> BatchFeature: diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index f460d4bb..1553c2d3 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -36,6 +36,8 @@ class DataCollatorSpeechSeq2SeqWithPadding(DataCollatorMixin): Args: processor (WhisperProcessor) The processor used for proccessing the data. + max_seconds_per_example (float): + The maximum number of seconds per example. padding (bool, str or PaddingStrategy, optional): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -53,6 +55,7 @@ class DataCollatorSpeechSeq2SeqWithPadding(DataCollatorMixin): """ processor: WhisperProcessor + max_seconds_per_example: float padding: bool | str = True return_tensors: str = "pt" From 61d2b04b7ae1c5f5721301ca7498b5771b9f2b93 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 13 Dec 2023 12:36:38 +0100 Subject: [PATCH 17/20] chore: Change config --- config/config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 2b55cc00..effb6745 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -47,13 +47,13 @@ ignore_data_skip: false save_total_limit: 2 # Optimisation parameters -learning_rate: 3e-5 +learning_rate: 1e-4 adam_first_momentum: 0.9 adam_second_momentum: 0.98 total_batch_size: 256 -per_device_batch_size: 64 -max_steps: 120_000 -warmup_steps: 10_000 +per_device_batch_size: 16 +max_steps: 10_000 +warmup_steps: 1_000 logging_steps: 10 eval_steps: 100 save_steps: 100 From 2f7573bc6ef5e27fe0dbadfd2c331f8c0fd145ff Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 13 Dec 2023 12:43:48 +0100 Subject: [PATCH 18/20] fix: Add max_seconds_per_example as argument to DataCollatorSpeechSeq2SeqWithPadding --- src/coral_models/whisper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index 1553c2d3..bf6372b2 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -173,7 +173,9 @@ def load_model(self) -> WhisperForConditionalGeneration: def load_data_collator(self) -> DataCollatorSpeechSeq2SeqWithPadding: return DataCollatorSpeechSeq2SeqWithPadding( - 
            processor=self.processor, padding=self.cfg.padding
+            processor=self.processor,
+            max_seconds_per_example=self.cfg.data.max_seconds_per_example,
+            padding=self.cfg.padding,
         )

     def load_trainer_class(self) -> Type[Trainer]:

From 6430fd782e45cec945926cb8e1f832639aa54e93 Mon Sep 17 00:00:00 2001
From: Dan Saattrup Nielsen
Date: Wed, 13 Dec 2023 12:52:09 +0100
Subject: [PATCH 19/20] fix: Typo in config max_seconds_per_example

---
 src/coral_models/whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py
index bf6372b2..fa59a5b6 100644
--- a/src/coral_models/whisper.py
+++ b/src/coral_models/whisper.py
@@ -174,7 +174,7 @@ def load_model(self) -> WhisperForConditionalGeneration:
     def load_data_collator(self) -> DataCollatorSpeechSeq2SeqWithPadding:
         return DataCollatorSpeechSeq2SeqWithPadding(
             processor=self.processor,
-            max_seconds_per_example=self.cfg.data.max_seconds_per_example,
+            max_seconds_per_example=self.cfg.max_seconds_per_example,
             padding=self.cfg.padding,
         )

From 6b801616eaa3dcd9796af6c0d18a44113c2f8f64 Mon Sep 17 00:00:00 2001
From: Dan Saattrup Nielsen
Date: Wed, 13 Dec 2023 13:02:00 +0100
Subject: [PATCH 20/20] fix: Remove `labels` kwarg from Whisper tokenizer padding

---
 src/coral_models/whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py
index fa59a5b6..a30991ff 100644
--- a/src/coral_models/whisper.py
+++ b/src/coral_models/whisper.py
@@ -92,7 +92,7 @@ def torch_call(self, features: list[dict]) -> BatchFeature:
         # Pad the labels to max length
         labels_batch = self.processor.tokenizer.pad(
-            labels=label_features,
+            label_features,
             padding=self.padding,
             return_tensors=self.return_tensors,
             max_length=512,
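
For reference, a minimal NumPy sketch of the masked-logit handling introduced in PATCH 12/20 above: whenever every logit at a timestep equals -100 (a padded position in the stacked predictions), the logit of the padding token is set to 0, so that the subsequent argmax decodes that position to the padding token instead of an arbitrary real character. The pad token id of 2 used here is made up for the sketch; in the repository it would come from the tokenizer.

import numpy as np

# Toy logits: one sample, two timesteps, vocab size 4. The second timestep is
# fully masked with -100, as happens for padded positions in the predictions.
pad_token = 2  # made-up id for this sketch; the real id comes from the tokenizer
logits = np.array(
    [
        [
            [0.1, 2.0, 0.3, 0.4],  # an ordinary timestep
            [-100.0, -100.0, -100.0, -100.0],  # a fully masked timestep
        ]
    ]
)

# Without the fix, argmax on the masked row picks index 0, i.e. a real
# character. Setting the pad-token logit to 0 wherever the whole row is -100
# makes the masked position decode to the pad token, which is then ignored.
logits[np.all(logits == -100, axis=-1), pad_token] = 0.0
pred_ids = np.argmax(logits, axis=-1)
print(pred_ids)  # [[1 2]] -> the masked timestep maps to the pad token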