From afdc325e9203afb35ec2ecbd4a393475bd927f50 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 09:51:21 +0100 Subject: [PATCH 01/20] chore: Update configs --- config/config.yaml | 6 +++--- config/model/wav2vec2.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index ec8017b8..24401896 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -51,9 +51,9 @@ learning_rate: 3e-5 adam_first_momentum: 0.9 adam_second_momentum: 0.98 total_batch_size: 256 -per_device_batch_size: 16 -max_steps: 50_000 -warmup_steps: 1_000 +per_device_batch_size: 64 +max_steps: 120_000 +warmup_steps: 10_000 logging_steps: 10 eval_steps: 100 save_steps: 100 diff --git a/config/model/wav2vec2.yaml b/config/model/wav2vec2.yaml index 768954ca..45b5b79e 100644 --- a/config/model/wav2vec2.yaml +++ b/config/model/wav2vec2.yaml @@ -14,9 +14,9 @@ hidden_dropout: 0.0 feat_proj_dropout: 0.0 feat_quantizer_dropout: 0.0 final_dropout: 0.0 -mask_time_prob: 0.5 +mask_time_prob: 0.3 mask_time_length: 10 -mask_feature_prob: 0.5 +mask_feature_prob: 0.3 mask_feature_length: 64 layerdrop: 0.1 # This will automatically be set to 0 in a multi-gpu setting ctc_loss_reduction: mean From 77bc20fd1459ab7c41a6fd9e04910536f11fd5d4 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 09:51:33 +0100 Subject: [PATCH 02/20] style: Logging, kwargs --- src/coral_models/compute_metrics.py | 2 +- src/coral_models/wav2vec2.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index 745dfafc..65e51fa5 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -42,7 +42,7 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, mismatch_dim = len(vocab_size) - predictions.shape[-1] predictions = np.pad(predictions, ((0, 0), (0, 0), (0, mismatch_dim))) predictions_str = tokenizer.batch_decode( - predictions, skip_special_tokens=True + sequences=predictions, skip_special_tokens=True ) # Otherwise, if we are not using a language model, we need to convert the diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 1092f3c6..1dee9980 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -133,11 +133,12 @@ def load_processor(self) -> Wav2Vec2Processor: ) break except json.decoder.JSONDecodeError: - process_id = os.getenv("RANK", 0) - logger.warning( - f"JSONDecodeError while loading tokenizer on process {process_id}. " - "Retrying in a second." - ) + log_message = "JSONDecodeError while loading tokenizer" + process_id = os.getenv("RANK") + if process_id is not None: + log_message += f" in process {process_id}" + log_message += ". Retrying in a second." 
+ logger.warning(log_message) time.sleep(1) # Set the `model_max_length` attribute of the tokenizer, if it hasn't been set, From 8a15b809c0c94a13f6371dab729edc575ab35be2 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 09:51:44 +0100 Subject: [PATCH 03/20] debug: Breakpoint --- src/coral_models/finetune.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coral_models/finetune.py b/src/coral_models/finetune.py index 723271bf..d9958e60 100644 --- a/src/coral_models/finetune.py +++ b/src/coral_models/finetune.py @@ -70,6 +70,7 @@ def finetune(cfg: DictConfig) -> None: model_setup: ModelSetup = load_model_setup(cfg) processor = model_setup.load_processor() + breakpoint() processor.save_pretrained(cfg.model_dir) model = model_setup.load_model() dataset = load_data(cfg) From 26ccd56d28714f08e50305684a96c2681faed35a Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 10:15:24 +0100 Subject: [PATCH 04/20] fix: Do not remove special tokens when decoding, as it prevents duplicate characters --- src/coral_models/compute_metrics.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index 65e51fa5..b6789694 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -41,18 +41,16 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, vocab_size = tokenizer.get_vocab() mismatch_dim = len(vocab_size) - predictions.shape[-1] predictions = np.pad(predictions, ((0, 0), (0, 0), (0, mismatch_dim))) - predictions_str = tokenizer.batch_decode( - sequences=predictions, skip_special_tokens=True - ) + predictions_str = tokenizer.batch_decode(sequences=predictions) # Otherwise, if we are not using a language model, we need to convert the # logits to token IDs and then decode the token IDs to get the predicted string else: pred_ids: NDArray[np.int_] = np.argmax(predictions, axis=-1) - predictions_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) + predictions_str = tokenizer.batch_decode(pred_ids) elif len(predictions.shape) == 2 and predictions.dtype == np.int_: - predictions_str = tokenizer.batch_decode(predictions, skip_special_tokens=True) + predictions_str = tokenizer.batch_decode(predictions) else: raise ValueError( @@ -67,9 +65,7 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, labels[labels == -100] = pad_token # Decode the ground truth labels - labels_str = tokenizer.batch_decode( - sequences=labels, skip_special_tokens=True, group_tokens=False - ) + labels_str = tokenizer.batch_decode(sequences=labels, group_tokens=False) # TEMP: Log both the predictions and the ground truth labels is_main_process = os.getenv("RANK", "0") == "0" From 309a60d7b0eeb9b4ff62ea0fe1f277123a51f6b9 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 10:15:59 +0100 Subject: [PATCH 05/20] chore: Remove breakpoint --- src/coral_models/finetune.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coral_models/finetune.py b/src/coral_models/finetune.py index d9958e60..723271bf 100644 --- a/src/coral_models/finetune.py +++ b/src/coral_models/finetune.py @@ -70,7 +70,6 @@ def finetune(cfg: DictConfig) -> None: model_setup: ModelSetup = load_model_setup(cfg) processor = model_setup.load_processor() - breakpoint() processor.save_pretrained(cfg.model_dir) model = model_setup.load_model() dataset = load_data(cfg) From 
bc07ea0d4f1a7d3425dd66a7df39dcbfec2dae51 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 28 Nov 2023 10:30:21 +0100 Subject: [PATCH 06/20] docs: Add note --- config/model/wav2vec2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/model/wav2vec2.yaml b/config/model/wav2vec2.yaml index 45b5b79e..5b85bbed 100644 --- a/config/model/wav2vec2.yaml +++ b/config/model/wav2vec2.yaml @@ -18,7 +18,7 @@ mask_time_prob: 0.3 mask_time_length: 10 mask_feature_prob: 0.3 mask_feature_length: 64 -layerdrop: 0.1 # This will automatically be set to 0 in a multi-gpu setting +layerdrop: 0.1 # NOTE: This will automatically be set to 0 in a multi-gpu setting ctc_loss_reduction: mean # Decoder hyperparameters From e6f3f437425d92063c6776c0f3624a1aca6c1156 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 16:05:30 +0100 Subject: [PATCH 07/20] docs: Always print sample predictions when computing metrics --- src/coral_models/compute_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index b6789694..71a594c9 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -67,7 +67,7 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, # Decode the ground truth labels labels_str = tokenizer.batch_decode(sequences=labels, group_tokens=False) - # TEMP: Log both the predictions and the ground truth labels + # Log both the predictions and the ground truth labels is_main_process = os.getenv("RANK", "0") == "0" if is_main_process: random_idx = np.random.randint(0, len(predictions_str)) From 5e20bd417a7937e8352c2cd2445f23cb58bdcfe7 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 16:05:55 +0100 Subject: [PATCH 08/20] chore: Deal with word delimiters --- config/config.yaml | 2 +- src/coral_models/data.py | 2 +- src/coral_models/wav2vec2.py | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 24401896..2b55cc00 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -15,7 +15,7 @@ dirs: seed: 4242 # Dataset parameters -characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü ' +characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü' max_seconds_per_example: 10 dataloader_num_workers: 8 diff --git a/src/coral_models/data.py b/src/coral_models/data.py index a3011ce7..37ac0e69 100644 --- a/src/coral_models/data.py +++ b/src/coral_models/data.py @@ -281,7 +281,7 @@ def clean_dataset( # transcriptions, as they do not have an influence on the pronunciation of the # words. 
non_standard_characters_regex = re.compile( - f"[^{re.escape(cfg.characters_to_keep)}]" + f"[^{re.escape(cfg.characters_to_keep + ' ')}]" ) mapped = dataset.map( diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 1dee9980..552769f5 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -129,7 +129,8 @@ def load_processor(self) -> Wav2Vec2Processor: pad_token="", bos_token="", eos_token="", - word_delimiter_token=" ", + word_delimiter_token="|", + replace_word_delimiter_char=" ", ) break except json.decoder.JSONDecodeError: @@ -156,6 +157,7 @@ def load_processor(self) -> Wav2Vec2Processor: self.processor = Wav2Vec2Processor( feature_extractor=extractor, tokenizer=tokenizer ) + return self.processor def load_model(self) -> Wav2Vec2ForCTC: @@ -180,7 +182,7 @@ def load_model(self) -> Wav2Vec2ForCTC: vocab_size=len(self.processor.tokenizer.get_vocab()), ctc_zero_infinity=True, ) - assert isinstance(model, Wav2Vec2ForCTC) + assert isinstance(model, Wav2Vec2ForCTC) if self.cfg.model.freeze_feature_encoder: for param in model.wav2vec2.parameters(): From 6478ee68a9818819da639e19811fa3e7950e98c6 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 16:47:13 +0100 Subject: [PATCH 09/20] chore: Update configs --- config/model/test_wav2vec2.yaml | 19 ++++++++++--------- config/model/wav2vec2.yaml | 6 +++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/config/model/test_wav2vec2.yaml b/config/model/test_wav2vec2.yaml index 44351548..9d869bc3 100644 --- a/config/model/test_wav2vec2.yaml +++ b/config/model/test_wav2vec2.yaml @@ -9,16 +9,17 @@ clean_dataset: true # Model hyperparameters sampling_rate: 16_000 activation_dropout: 0.1 -attention_dropout: 0.1 -hidden_dropout: 0.1 -feat_proj_dropout: 0.1 -final_dropout: 0.1 -mask_time_prob: 0.075 +attention_dropout: 0.0 +hidden_dropout: 0.0 +feat_proj_dropout: 0.0 +feat_quantizer_dropout: 0.0 +final_dropout: 0.0 +mask_time_prob: 0.5 mask_time_length: 10 -mask_feature_prob: 0.075 -mask_feature_length: 10 -layerdrop: 0.0 # NOTE: This parameter cannot be used in a multi-gpu setting! 
-ctc_loss_reduction: sum +mask_feature_prob: 0.5 +mask_feature_length: 64 +layerdrop: 0.1 # NOTE: This will automatically be set to 0 in a multi-gpu setting +ctc_loss_reduction: mean # Decoder hyperparameters language_model_decoder: null diff --git a/config/model/wav2vec2.yaml b/config/model/wav2vec2.yaml index 5b85bbed..affb3ccd 100644 --- a/config/model/wav2vec2.yaml +++ b/config/model/wav2vec2.yaml @@ -14,12 +14,12 @@ hidden_dropout: 0.0 feat_proj_dropout: 0.0 feat_quantizer_dropout: 0.0 final_dropout: 0.0 -mask_time_prob: 0.3 +mask_time_prob: 0.5 mask_time_length: 10 -mask_feature_prob: 0.3 +mask_feature_prob: 0.5 mask_feature_length: 64 layerdrop: 0.1 # NOTE: This will automatically be set to 0 in a multi-gpu setting -ctc_loss_reduction: mean +ctc_loss_reduction: sum # Decoder hyperparameters language_model_decoder: ngram From ed4d97c69495c8fc5c560c2fa3d4484b73c7ce3e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 5 Dec 2023 17:42:31 +0100 Subject: [PATCH 10/20] fix: Do not hardcode max_seconds_per_example, and add | and space to characters_to_keep --- src/coral_models/data.py | 2 +- src/coral_models/wav2vec2.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/coral_models/data.py b/src/coral_models/data.py index 37ac0e69..be429e00 100644 --- a/src/coral_models/data.py +++ b/src/coral_models/data.py @@ -281,7 +281,7 @@ def clean_dataset( # transcriptions, as they do not have an influence on the pronunciation of the # words. non_standard_characters_regex = re.compile( - f"[^{re.escape(cfg.characters_to_keep + ' ')}]" + f"[^{re.escape(cfg.characters_to_keep + ' |')}]" ) mapped = dataset.map( diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 552769f5..2944d853 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -43,6 +43,8 @@ class DataCollatorCTCWithPadding(DataCollatorMixin): Args: processor (Wav2Vec2Processor) The processor used for proccessing the data. + max_seconds_per_example (float): + The maximum number of seconds per example. 
padding (bool, str or PaddingStrategy, optional): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -61,6 +63,7 @@ class DataCollatorCTCWithPadding(DataCollatorMixin): processor: Wav2Vec2Processor padding: bool | str + max_seconds_per_example: float return_tensors: str = "pt" def torch_call(self, features: list[dict]) -> BatchFeature: @@ -86,12 +89,12 @@ def torch_call(self, features: list[dict]) -> BatchFeature: audio_features, padding=self.padding, return_tensors=self.return_tensors, - max_length=16_000 * 10, + max_length=16_000 * self.max_seconds_per_example, ) label_features = [dict(input_ids=feature["labels"]) for feature in features] - labels_batch: BatchEncoding = self.processor.tokenizer.pad( - label_features, + labels_batch: BatchEncoding = self.processor.pad( + labels=label_features, padding=self.padding, return_tensors=self.return_tensors, max_length=512, @@ -192,7 +195,9 @@ def load_model(self) -> Wav2Vec2ForCTC: def load_data_collator(self) -> DataCollatorCTCWithPadding: return DataCollatorCTCWithPadding( - processor=self.processor, padding=self.cfg.padding + processor=self.processor, + max_seconds_per_example=self.cfg.max_seconds_per_example, + padding=self.cfg.padding, ) def load_trainer_class(self) -> Type[Trainer]: @@ -278,7 +283,9 @@ def load_saved(self) -> PreTrainedModelData: model = Wav2Vec2ForCTC.from_pretrained(self.cfg.hub_id, token=True) data_collator = DataCollatorCTCWithPadding( - processor=processor, padding=self.cfg.padding + processor=processor, + max_seconds_per_example=self.cfg.max_seconds_per_example, + padding=self.cfg.padding, ) compute_metrics = partial(compute_wer_metrics, processor=processor) return PreTrainedModelData( @@ -299,7 +306,7 @@ def dump_vocabulary(cfg: DictConfig) -> None: The Hydra configuration object. 
""" # Build the set of all unique characters in the dataset - unique_characters: set[str] = set(cfg.characters_to_keep) + unique_characters: set[str] = set(cfg.characters_to_keep + "|") # Build vocabulary vocab = {char: idx for idx, char in enumerate(unique_characters)} From 627039919e43d54c4b3caa103fb49877a9f48a7f Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:00:23 +0100 Subject: [PATCH 11/20] fix: Ensure that we pad with pad_token when using a LM decoder --- src/coral_models/compute_metrics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index 71a594c9..b3283aff 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -40,7 +40,12 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, if predictions.dtype == np.int_: vocab_size = tokenizer.get_vocab() mismatch_dim = len(vocab_size) - predictions.shape[-1] - predictions = np.pad(predictions, ((0, 0), (0, 0), (0, mismatch_dim))) + predictions = np.pad( + array=predictions, + pad_width=((0, 0), (0, 0), (0, mismatch_dim)), + mode="constant", + constant_values=pad_token, + ) predictions_str = tokenizer.batch_decode(sequences=predictions) # Otherwise, if we are not using a language model, we need to convert the From 4c0d09a637d31e76869f570becc4c1799e89198f Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:00:45 +0100 Subject: [PATCH 12/20] fix: Ensure that pad_token is chosen when all logits for a token are -100 --- src/coral_models/compute_metrics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/coral_models/compute_metrics.py b/src/coral_models/compute_metrics.py index b3283aff..9e252ecf 100644 --- a/src/coral_models/compute_metrics.py +++ b/src/coral_models/compute_metrics.py @@ -51,11 +51,16 @@ def compute_wer_metrics(pred: EvalPrediction, processor: Processor) -> dict[str, # Otherwise, if we are not using a language model, we need to convert the # logits to token IDs and then decode the token IDs to get the predicted string else: + # If all the logits are -100 for a token, then we set the logit for the + # padding token for that token to 0. 
This is to ensure that this token gets + # decoded to a padding token, and are therefore ignored + predictions[np.all(predictions == -100, axis=-1), pad_token] = 0 + pred_ids: NDArray[np.int_] = np.argmax(predictions, axis=-1) predictions_str = tokenizer.batch_decode(pred_ids) elif len(predictions.shape) == 2 and predictions.dtype == np.int_: - predictions_str = tokenizer.batch_decode(predictions) + predictions_str = tokenizer.batch_decode(sequences=predictions) else: raise ValueError( From 5d15643f8043264bae560b2eb03c2c4004027c66 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:01:38 +0100 Subject: [PATCH 13/20] fix: Do not add special tokens to vocab, as then they won't count as special tokens --- src/coral_models/wav2vec2.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index 2944d853..f49ba47b 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -128,8 +128,8 @@ def load_processor(self) -> Wav2Vec2Processor: dump_vocabulary(self.cfg) tokenizer: Wav2Vec2CTCTokenizer = Wav2Vec2CTCTokenizer.from_pretrained( self.cfg.model_dir, - unk_token="", pad_token="", + unk_token="", bos_token="", eos_token="", word_delimiter_token="|", @@ -310,8 +310,6 @@ def dump_vocabulary(cfg: DictConfig) -> None: # Build vocabulary vocab = {char: idx for idx, char in enumerate(unique_characters)} - for tok in ["", "", "", ""]: - vocab[tok] = len(vocab) # Dump the vocabulary to a json file model_dir = Path(cfg.model_dir) From 4818ec82738f346c38f6fc885b3432c965f1a3ea Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 13:02:26 +0100 Subject: [PATCH 14/20] fix: Update padding kwargs in Whisper analogous to Wav2Vec2 --- src/coral_models/whisper.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index 3ade4940..6c459097 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -77,15 +77,23 @@ def torch_call(self, features: list[dict]) -> BatchFeature: raise ValueError( "Features must contain either 'input_features' or 'audio' key." ) - batch = self.processor.feature_extractor.pad( - audio_features, return_tensors="pt" + batch = self.processor.pad( + audio_features, + padding=self.padding, + return_tensors=self.return_tensors, + max_length=16_000 * self.max_seconds_per_example, ) # Get the tokenized label sequences label_features = [{"input_ids": feature["labels"]} for feature in features] # Pad the labels to max length - labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt") + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + return_tensors=self.return_tensors, + max_length=512, + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill( From 7a9d4bb068d1485cc9408383b9803d8e954fe5ca Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Thu, 7 Dec 2023 17:15:02 +0100 Subject: [PATCH 15/20] fix: Padding with a WhisperProcessor --- src/coral_models/whisper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index 6c459097..f460d4bb 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -77,7 +77,7 @@ def torch_call(self, features: list[dict]) -> BatchFeature: raise ValueError( "Features must contain either 'input_features' or 'audio' key." 
) - batch = self.processor.pad( + batch = self.processor.feature_extractor.pad( audio_features, padding=self.padding, return_tensors=self.return_tensors, @@ -88,7 +88,7 @@ def torch_call(self, features: list[dict]) -> BatchFeature: label_features = [{"input_ids": feature["labels"]} for feature in features] # Pad the labels to max length - labels_batch = self.processor.pad( + labels_batch = self.processor.tokenizer.pad( labels=label_features, padding=self.padding, return_tensors=self.return_tensors, From 56d617c0b08450ea0c7ff1d697a6272f87b99b3e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 13 Dec 2023 12:36:09 +0100 Subject: [PATCH 16/20] fix: Add `max_seconds_per_example` to Whisper Processor --- src/coral_models/wav2vec2.py | 2 +- src/coral_models/whisper.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/coral_models/wav2vec2.py b/src/coral_models/wav2vec2.py index f49ba47b..983269c1 100644 --- a/src/coral_models/wav2vec2.py +++ b/src/coral_models/wav2vec2.py @@ -62,8 +62,8 @@ class DataCollatorCTCWithPadding(DataCollatorMixin): """ processor: Wav2Vec2Processor - padding: bool | str max_seconds_per_example: float + padding: bool | str return_tensors: str = "pt" def torch_call(self, features: list[dict]) -> BatchFeature: diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index f460d4bb..1553c2d3 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -36,6 +36,8 @@ class DataCollatorSpeechSeq2SeqWithPadding(DataCollatorMixin): Args: processor (WhisperProcessor) The processor used for proccessing the data. + max_seconds_per_example (float): + The maximum number of seconds per example. padding (bool, str or PaddingStrategy, optional): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -53,6 +55,7 @@ class DataCollatorSpeechSeq2SeqWithPadding(DataCollatorMixin): """ processor: WhisperProcessor + max_seconds_per_example: float padding: bool | str = True return_tensors: str = "pt" From 61d2b04b7ae1c5f5721301ca7498b5771b9f2b93 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 13 Dec 2023 12:36:38 +0100 Subject: [PATCH 17/20] chore: Change config --- config/config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 2b55cc00..effb6745 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -47,13 +47,13 @@ ignore_data_skip: false save_total_limit: 2 # Optimisation parameters -learning_rate: 3e-5 +learning_rate: 1e-4 adam_first_momentum: 0.9 adam_second_momentum: 0.98 total_batch_size: 256 -per_device_batch_size: 64 -max_steps: 120_000 -warmup_steps: 10_000 +per_device_batch_size: 16 +max_steps: 10_000 +warmup_steps: 1_000 logging_steps: 10 eval_steps: 100 save_steps: 100 From 2f7573bc6ef5e27fe0dbadfd2c331f8c0fd145ff Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 13 Dec 2023 12:43:48 +0100 Subject: [PATCH 18/20] fix: Add max_seconds_per_example as argument to DataCollatorSpeechSeq2SeqWithPadding --- src/coral_models/whisper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py index 1553c2d3..bf6372b2 100644 --- a/src/coral_models/whisper.py +++ b/src/coral_models/whisper.py @@ -173,7 +173,9 @@ def load_model(self) -> WhisperForConditionalGeneration: def load_data_collator(self) -> DataCollatorSpeechSeq2SeqWithPadding: return DataCollatorSpeechSeq2SeqWithPadding( - 
            processor=self.processor, padding=self.cfg.padding
+            processor=self.processor,
+            max_seconds_per_example=self.cfg.data.max_seconds_per_example,
+            padding=self.cfg.padding,
         )

     def load_trainer_class(self) -> Type[Trainer]:

From 6430fd782e45cec945926cb8e1f832639aa54e93 Mon Sep 17 00:00:00 2001
From: Dan Saattrup Nielsen
Date: Wed, 13 Dec 2023 12:52:09 +0100
Subject: [PATCH 19/20] fix: Typo in config max_seconds_per_example

---
 src/coral_models/whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py
index bf6372b2..fa59a5b6 100644
--- a/src/coral_models/whisper.py
+++ b/src/coral_models/whisper.py
@@ -174,7 +174,7 @@ def load_model(self) -> WhisperForConditionalGeneration:
     def load_data_collator(self) -> DataCollatorSpeechSeq2SeqWithPadding:
         return DataCollatorSpeechSeq2SeqWithPadding(
             processor=self.processor,
-            max_seconds_per_example=self.cfg.data.max_seconds_per_example,
+            max_seconds_per_example=self.cfg.max_seconds_per_example,
             padding=self.cfg.padding,
         )

From 6b801616eaa3dcd9796af6c0d18a44113c2f8f64 Mon Sep 17 00:00:00 2001
From: Dan Saattrup Nielsen
Date: Wed, 13 Dec 2023 13:02:00 +0100
Subject: [PATCH 20/20] fix: Remove `labels` kwarg from Whisper tokenizer padding

---
 src/coral_models/whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coral_models/whisper.py b/src/coral_models/whisper.py
index fa59a5b6..a30991ff 100644
--- a/src/coral_models/whisper.py
+++ b/src/coral_models/whisper.py
@@ -92,7 +92,7 @@ def torch_call(self, features: list[dict]) -> BatchFeature:
         # Pad the labels to max length
         labels_batch = self.processor.tokenizer.pad(
-            labels=label_features,
+            label_features,
             padding=self.padding,
             return_tensors=self.return_tensors,
             max_length=512,
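
For reference, a minimal NumPy sketch of the masked-logit handling introduced in PATCH 12/20 above: whenever every logit at a timestep equals -100 (a padded position in the stacked predictions), the logit of the padding token is set to 0, so that the subsequent argmax decodes that position to the padding token instead of an arbitrary real character. The pad token id of 2 used here is made up for the sketch; in the repository it would come from the tokenizer.

import numpy as np

# Toy logits: one sample, two timesteps, vocab size 4. The second timestep is
# fully masked with -100, as happens for padded positions in the predictions.
pad_token = 2  # made-up id for this sketch; the real id comes from the tokenizer
logits = np.array(
    [
        [
            [0.1, 2.0, 0.3, 0.4],  # an ordinary timestep
            [-100.0, -100.0, -100.0, -100.0],  # a fully masked timestep
        ]
    ]
)

# Without the fix, argmax on the masked row picks index 0, i.e. a real
# character. Setting the pad-token logit to 0 wherever the whole row is -100
# makes the masked position decode to the pad token, which is then ignored.
logits[np.all(logits == -100, axis=-1), pad_token] = 0.0
pred_ids = np.argmax(logits, axis=-1)
print(pred_ids)  # [[1 2]] -> the masked timestep maps to the pad token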