Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add config #19

Merged
merged 12 commits into from
Jan 30, 2025
31 changes: 11 additions & 20 deletions demo/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,12 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fcebc6a2-b74c-47b4-bd94-b945bef9177f",
"metadata": {},
"cell_type": "code",
"outputs": [],
"source": [
"from src.speech_to_text_finetune.finetune_whisper import run_finetuning"
]
"execution_count": null,
"source": "from speech_to_text_finetune.finetune_whisper import run_finetuning",
"id": "da07095b78eba3c0"
},
{
"cell_type": "markdown",
Expand All @@ -139,7 +137,9 @@
"metadata": {},
"outputs": [],
"source": [
"model_id = \"openai/whisper-tiny\"\n",
"# @title Finetuning configuration and hyperparameter setting\n",
"\n",
"model_id = \"openai/whisper-tiny\" # @ [\"openai/whisper-tiny\", \"openai/whisper-small\", \"openai/whisper-medium\"]\n",
"dataset_id = \"mozilla-foundation/common_voice_17_0\"\n",
"language = \"Greek\"\n",
"\n",
Expand All @@ -161,21 +161,12 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e885747a52fc2b3",
"metadata": {},
"cell_type": "code",
"outputs": [],
"source": [
"run_finetuning(\n",
" model_id=model_id,\n",
" dataset_id=dataset_id,\n",
" language=language,\n",
" repo_name=repo_name,\n",
" max_steps=test_max_steps,\n",
" private_hf_repo=make_repo_private,\n",
")"
]
"execution_count": null,
"source": "run_finetuning(config_path=\"src/speech_to_text_finetune/config.yaml\")",
"id": "73ef3bd5de291da3"
}
],
"metadata": {
Expand Down
53 changes: 53 additions & 0 deletions src/speech_to_text_finetune/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import yaml
from pydantic import BaseModel


def load_config(config_path: str) -> "Config":
    """Load a YAML configuration file and validate it into a Config object.

    Args:
        config_path (str): path to a yaml file following the schema defined by Config

    Returns:
        Config: validated configuration object

    Raises:
        FileNotFoundError: if config_path does not exist
        ValueError: if the yaml file is empty or its top level is not a mapping
    """
    with open(config_path, "r") as file:
        config_dict = yaml.safe_load(file)

    # yaml.safe_load returns None for an empty file (and may return a scalar or
    # list for malformed configs); fail early with a clear message instead of
    # letting Config(**None) raise an opaque TypeError.
    if not isinstance(config_dict, dict):
        raise ValueError(
            f"Expected a yaml mapping in {config_path}, "
            f"got {type(config_dict).__name__}"
        )

    return Config(**config_dict)


class TrainingConfig(BaseModel):
    """
    Selected hyperparameters forwarded verbatim to Seq2SeqTrainingArguments.

    Field names intentionally match the Seq2SeqTrainingArguments keyword
    arguments so the whole model can be splatted into the constructor.
    More info at https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments
    """

    push_to_hub: bool  # upload the finetuned model to the Hugging Face Hub
    hub_private_repo: bool  # make the HF repo private (True) or public (False)
    max_steps: int  # total number of training steps to run
    per_device_train_batch_size: int  # training batch size per GPU/CPU device
    gradient_accumulation_steps: int  # steps to accumulate before an optimizer update
    learning_rate: float  # initial learning rate for the optimizer
    warmup_steps: int  # steps of linear LR warmup from 0 to learning_rate
    gradient_checkpointing: bool  # trade compute for memory during backprop
    fp16: bool  # use mixed-precision (float16) training
    eval_strategy: str  # when to evaluate, e.g. "steps" or "epoch"
    per_device_eval_batch_size: int  # evaluation batch size per device
    predict_with_generate: bool  # use model.generate() for eval predictions (needed for WER)
    generation_max_length: int  # max token length when generating during evaluation
    save_steps: int  # checkpoint save interval (in steps)
    logging_steps: int  # logging interval (in steps)
    load_best_model_at_end: bool  # restore the best checkpoint when training finishes
    metric_for_best_model: str  # metric used to pick the best checkpoint, e.g. "wer"
    greater_is_better: bool  # False for error metrics like WER (lower is better)


class Config(BaseModel):
    """
    Top-level configuration for a Whisper finetuning run, typically populated
    from a yaml file via load_config().

    Attributes:
        model_id (str): HF model id of a Whisper model used for finetuning
        dataset_id (str): HF dataset id of a Common Voice dataset version, ideally from the mozilla-foundation repo
        language (str): registered language string that is supported by the Common Voice dataset
        repo_name (str | None): used both for local dir and HF, None will create a name based on the model and language id
        training_hp (TrainingConfig): store selective hyperparameter values from Seq2SeqTrainingArguments
    """

    model_id: str  # e.g. "openai/whisper-tiny"
    dataset_id: str  # e.g. "mozilla-foundation/common_voice_17_0"
    language: str  # human-readable language name, e.g. "Greek"
    repo_name: str | None  # None => auto-generated from model id + language id
    training_hp: TrainingConfig  # hyperparameters splatted into Seq2SeqTrainingArguments
24 changes: 24 additions & 0 deletions src/speech_to_text_finetune/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Finetuning configuration; parsed by load_config() and validated against the
# Config / TrainingConfig pydantic models in config.py.

model_id: openai/whisper-tiny
dataset_id: mozilla-foundation/common_voice_17_0
language: Greek
# Must be YAML null (not the word "None") so pydantic receives a real None and
# the code auto-generates a "<model>-<language_id>" repo name. The bare token
# "None" would be parsed as the truthy string "None" and become the repo name.
repo_name: null

# Forwarded verbatim to transformers.Seq2SeqTrainingArguments.
training_hp:
  push_to_hub: False
  hub_private_repo: True
  max_steps: 1
  per_device_train_batch_size: 64
  gradient_accumulation_steps: 1
  # Keep the decimal point: PyYAML (YAML 1.1) resolves bare "1e-5" as a string,
  # not a float; "1.0e-5" is unambiguously numeric.
  learning_rate: 1.0e-5
  warmup_steps: 50
  gradient_checkpointing: True
  fp16: True
  eval_strategy: steps
  per_device_eval_batch_size: 8
  predict_with_generate: True
  generation_max_length: 225
  save_steps: 250
  logging_steps: 25
  load_best_model_at_end: True
  metric_for_best_model: wer
  greater_is_better: False
4 changes: 3 additions & 1 deletion src/speech_to_text_finetune/data_process.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

import torch
from dataclasses import dataclass
from typing import Dict, List, Union
Expand Down Expand Up @@ -60,7 +62,7 @@ def process_dataset(
_process_inputs_and_labels_for_whisper,
fn_kwargs={"feature_extractor": feature_extractor, "tokenizer": tokenizer},
remove_columns=dataset.column_names["train"],
num_proc=2,
num_proc=os.cpu_count(),
)
return dataset

Expand Down
110 changes: 35 additions & 75 deletions src/speech_to_text_finetune/finetune_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,81 +14,66 @@
import evaluate
from evaluate import EvaluationModule
from loguru import logger
from src.speech_to_text_finetune.data_process import (

from speech_to_text_finetune.config import load_config
from speech_to_text_finetune.data_process import (
load_common_voice,
DataCollatorSpeechSeq2SeqWithPadding,
process_dataset,
)
from src.speech_to_text_finetune.hf_utils import (
from speech_to_text_finetune.hf_utils import (
get_hf_username,
upload_custom_hf_model_card,
get_available_languages_in_cv,
)

hf_username = get_hf_username()
dataset_id_cv = "mozilla-foundation/common_voice_17_0"
model_id_whisper = "openai/whisper-tiny"
test_language = "Greek"

test_repo_name = "testing" # None for default name, or set your own
test_max_steps = 100
push_to_hf = True
make_repo_private = False


def run_finetuning(
model_id: str,
dataset_id: str,
language: str,
repo_name: str | None,
max_steps: int = 2000,
private_hf_repo: bool = True,
) -> Tuple[Dict, Dict]:

def run_finetuning(config_path: str = "config.yaml") -> Tuple[Dict, Dict]:
"""
Complete pipeline for preprocessing the Common Voice dataset and then finetuning a Whisper model on it.

Args:
model_id (str): HF model id of a Whisper model used for finetuning
dataset_id (str): HF dataset id of a Common Voice dataset version, ideally from the mozilla-foundation repo
language (str): registered language string that is supported by the Common Voice dataset
repo_name (str): repo ID that will be used for storing artifacts both locally and on HF
max_steps (int): number of steps to run the training job, defaults to 2000
private_hf_repo (bool): flag whether to make the HF public (False) or private (True)
config_path (str): The filepath to a yaml file that follows the format defined in config.py

Returns:
Tuple[Dict, Dict]: evaluation metrics from the baseline and the finetuned models
"""
cfg = load_config(config_path)

languages_name_to_id = get_available_languages_in_cv(dataset_id)
language_id = languages_name_to_id[language]
hf_username = get_hf_username()
daavoo marked this conversation as resolved.
Show resolved Hide resolved

if not repo_name:
repo_name = f"{model_id.split('/')[1]}-{language_id}"
hf_repo_name = f"{hf_username}/{repo_name}"
local_output_dir = f"./artifacts/{repo_name}"
languages_name_to_id = get_available_languages_in_cv(cfg.dataset_id)
language_id = languages_name_to_id[cfg.language]

if not cfg.repo_name:
cfg.repo_name = f"{cfg.model.model_id.split('/')[1]}-{language_id}"
hf_repo_name = f"{hf_username}/{cfg.repo_name}"
local_output_dir = f"./artifacts/{cfg.repo_name}"

logger.info(
f"Finetuning job will soon start. "
f"Results will be saved local at {local_output_dir} uploaded in HF at {hf_repo_name}. "
f"Private repo is set to {private_hf_repo}."
f"Private repo is set to {cfg.training_hp.hub_private_repo}."
)

logger.info(f"Loading the {language} subset from the {dataset_id} dataset.")
dataset = load_common_voice(dataset_id, language_id)
logger.info(f"Loading the {cfg.language} subset from the {cfg.dataset_id} dataset.")
dataset = load_common_voice(cfg.dataset_id, language_id)

device = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"

logger.info(f"Loading {model_id} on {device} and configuring it for {language}.")
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
logger.info(
f"Loading {cfg.model_id} on {device} and configuring it for {cfg.language}."
)
feature_extractor = WhisperFeatureExtractor.from_pretrained(cfg.model_id)
tokenizer = WhisperTokenizer.from_pretrained(
model_id, language=language, task="transcribe"
cfg.model_id, language=cfg.language, task="transcribe"
)
processor = WhisperProcessor.from_pretrained(
model_id, language=language, task="transcribe"
cfg.model_id, language=cfg.language, task="transcribe"
)
model = WhisperForConditionalGeneration.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(cfg.model_id)

model.generation_config.language = language.lower()
model.generation_config.language = cfg.language.lower()
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

Expand All @@ -102,27 +87,9 @@ def run_finetuning(

training_args = Seq2SeqTrainingArguments(
output_dir=local_output_dir,
per_device_train_batch_size=64,
gradient_accumulation_steps=1,
learning_rate=1e-5,
warmup_steps=50,
max_steps=max_steps,
gradient_checkpointing=True,
fp16=True,
eval_strategy="steps",
per_device_eval_batch_size=8,
predict_with_generate=True,
generation_max_length=225,
save_steps=250,
eval_steps=250,
logging_steps=25,
load_best_model_at_end=True,
metric_for_best_model="wer",
greater_is_better=False,
report_to=["tensorboard"],
push_to_hub=push_to_hf,
hub_model_id=hf_repo_name,
hub_private_repo=private_hf_repo,
report_to=["tensorboard"],
**cfg.training_hp.dict(),
)

metric = evaluate.load("wer")
Expand All @@ -142,7 +109,7 @@ def run_finetuning(
processor.save_pretrained(training_args.output_dir)

logger.info(
f"Before finetuning, run evaluation on the baseline model {model_id} to easily compare performance"
f"Before finetuning, run evaluation on the baseline model {cfg.model_id} to easily compare performance"
f" before and after finetuning"
)
baseline_eval_results = trainer.evaluate()
Expand All @@ -159,15 +126,15 @@ def run_finetuning(
eval_results = trainer.evaluate()
logger.info(f"Evaluation complete. Results:\n\t {eval_results}")

if push_to_hf:
if cfg.training_hp.push_to_hf:
logger.info(f"Uploading model and eval results to HuggingFace: {hf_repo_name}")
trainer.push_to_hub()
upload_custom_hf_model_card(
hf_repo_name=hf_repo_name,
model_id=model_id,
dataset_id=dataset_id,
model_id=cfg.model_id,
dataset_id=cfg.dataset_id,
language_id=language_id,
language=language,
language=cfg.language,
n_train_samples=dataset["train"].num_rows,
n_eval_samples=dataset["test"].num_rows,
baseline_eval_results=baseline_eval_results,
Expand Down Expand Up @@ -216,11 +183,4 @@ def compute_word_error_rate(


if __name__ == "__main__":
run_finetuning(
model_id=model_id_whisper,
dataset_id=dataset_id_cv,
language=test_language,
repo_name=test_repo_name,
max_steps=test_max_steps,
private_hf_repo=make_repo_private,
)
run_finetuning(config_path="src/speech_to_text_finetune/config.yaml")
4 changes: 2 additions & 2 deletions src/speech_to_text_finetune/hf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ def upload_custom_hf_model_card(
ft_eval_results: Dict,
) -> None:
"""
Create and upload a custom Model Card (<TODO: hf reference here>) to the Hugging Face repo of the finetuned model
that highlights the evaluation results before and after finetuning.
Create and upload a custom Model Card (https://huggingface.co/docs/hub/model-cards) to the Hugging Face repo
of the finetuned model that highlights the evaluation results before and after finetuning.
"""

card_metadata = ModelCardData(
Expand Down