From ee1a6fabef26bb07465bf38abebcc211b974bee1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?=
Date: Thu, 5 Sep 2024 22:24:18 +0200
Subject: [PATCH 01/14] Add first scratch of implementation for maestro CLI

---
 maestro/cli/__init__.py                        |   0
 maestro/cli/env.py                             |   2 +
 maestro/cli/introspection.py                   |  36 +++++
 maestro/cli/main.py                            |  15 ++
 maestro/cli/utils.py                           |   2 +
 .../trainer/common/utils/metrics_tracing.py    |  61 +++----
 maestro/trainer/models/florence_2/entities.py  |  41 +++++
 .../trainer/models/florence_2/entrypoint.py    | 152 ++++++++++++++++++
 maestro/trainer/models/florence_2/training.py  |  52 ++----
 .../trainer/models/paligemma/entrypoint.py     |  13 ++
 requirements/requirements.txt                  |   3 +-
 setup.py                                       |   5 +
 12 files changed, 298 insertions(+), 84 deletions(-)
 create mode 100644 maestro/cli/__init__.py
 create mode 100644 maestro/cli/env.py
 create mode 100644 maestro/cli/introspection.py
 create mode 100644 maestro/cli/main.py
 create mode 100644 maestro/cli/utils.py
 create mode 100644 maestro/trainer/models/florence_2/entrypoint.py
 create mode 100644 maestro/trainer/models/paligemma/entrypoint.py

diff --git a/maestro/cli/__init__.py b/maestro/cli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/maestro/cli/env.py b/maestro/cli/env.py
new file mode 100644
index 0000000..b95525e
--- /dev/null
+++ b/maestro/cli/env.py
@@ -0,0 +1,2 @@
+DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "DISABLE_RECIPE_IMPORTS_WARNINGS"
+DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "False"
diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py
new file mode 100644
index 0000000..3a62ce3
--- /dev/null
+++ b/maestro/cli/introspection.py
@@ -0,0 +1,36 @@
+import os
+
+import typer
+
+from maestro.cli.env import DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV
+from maestro.cli.utils import str2bool
+
+
+def find_training_recipes(app: typer.Typer) -> None:
+    try:
+        from maestro.trainer.models.florence_2.entrypoint import florence_2_app
+
+        app.add_typer(florence_2_app, name="florence2")
+    except Exception:
+        _warn_about_recipe_import_error(model_name="Florence 2")
+
+    try:
+        from maestro.trainer.models.paligemma.entrypoint import paligemma_app
+
+        app.add_typer(paligemma_app, name="paligemma")
+    except Exception:
+        _warn_about_recipe_import_error(model_name="PaliGemma")
+
+
+def _warn_about_recipe_import_error(model_name: str) -> None:
+    disable_warnings = str2bool(
+        os.getenv(
+            DISABLE_RECIPE_IMPORTS_WARNINGS_ENV,
+            DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV,
+        )
+    )
+    if disable_warnings:
+        return None
+    warning = typer.style("WARNING", fg=typer.colors.RED, bold=True)
+    message = "🚧 " + warning + f" cannot import recipe for {model_name}"
+    typer.echo(message)
diff --git a/maestro/cli/main.py b/maestro/cli/main.py
new file mode 100644
index 0000000..b600e3a
--- /dev/null
+++ b/maestro/cli/main.py
@@ -0,0 +1,15 @@
+import typer
+
+from maestro.cli.introspection import find_training_recipes
+
+app = typer.Typer()
+find_training_recipes(app=app)
+
+
+@app.command(help="Display information about maestro")
+def info():
+    typer.echo("Welcome to maestro CLI. Let's train some VLM!
🏋") + + +if __name__ == "__main__": + app() diff --git a/maestro/cli/utils.py b/maestro/cli/utils.py new file mode 100644 index 0000000..0751fef --- /dev/null +++ b/maestro/cli/utils.py @@ -0,0 +1,2 @@ +def str2bool(value: str) -> bool: + return value.lower() in {"y", "t", "yes", "true"} diff --git a/maestro/trainer/common/utils/metrics_tracing.py b/maestro/trainer/common/utils/metrics_tracing.py index d99bbbe..c7b011e 100644 --- a/maestro/trainer/common/utils/metrics_tracing.py +++ b/maestro/trainer/common/utils/metrics_tracing.py @@ -3,7 +3,7 @@ import json import os from collections import defaultdict -from typing import Dict, Tuple, List +from typing import Dict, Tuple, List, Optional import matplotlib.pyplot as plt @@ -33,23 +33,17 @@ def get_metric_values( return [value[2] for value in self._metrics[metric]] def as_json( - self, - output_dir: str = None, - filename: str = None + self, output_dir: Optional[str] = None, filename: Optional[str] = None ) -> Dict[str, List[Dict[str, float]]]: metrics_data = {} for metric, values in self._metrics.items(): - metrics_data[metric] = [ - {'epoch': epoch, 'step': step, 'value': value} - for epoch, step, value - in values - ] + metrics_data[metric] = [{"epoch": epoch, "step": step, "value": value} for epoch, step, value in values] if output_dir and filename: if not os.path.exists(output_dir): os.makedirs(output_dir) filepath = os.path.join(output_dir, filename) - with open(filepath, 'w') as file: + with open(filepath, "w") as file: json.dump(metrics_data, file, indent=4) return metrics_data @@ -59,19 +53,11 @@ def aggregate_by_epoch(metric_values: List[Tuple[int, int, float]]) -> Dict[int, epoch_data = defaultdict(list) for epoch, step, value in metric_values: epoch_data[epoch].append(value) - avg_per_epoch = { - epoch: sum(values) / len(values) - for epoch, values - in epoch_data.items() - } + avg_per_epoch = {epoch: sum(values) / len(values) for epoch, values in epoch_data.items()} return avg_per_epoch -def save_metric_plots( - training_tracker: MetricsTracker, - validation_tracker: MetricsTracker, - output_dir: str -): +def save_metric_plots(training_tracker: MetricsTracker, validation_tracker: MetricsTracker, output_dir: str): if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -83,43 +69,32 @@ def save_metric_plots( plt.figure(figsize=(8, 6)) if metric in training_metrics: - training_values = training_tracker.get_metric_values( - metric=metric, with_index=True) + training_values = training_tracker.get_metric_values(metric=metric, with_index=True) training_avg_values = aggregate_by_epoch(training_values) training_epochs = sorted(training_avg_values.keys()) training_vals = [training_avg_values[epoch] for epoch in training_epochs] plt.plot( - training_epochs, - training_vals, - label=f'Training {metric}', - marker='o', - linestyle='-', - color='blue' + training_epochs, training_vals, label=f"Training {metric}", marker="o", linestyle="-", color="blue" ) if metric in validation_metrics: - validation_values = validation_tracker.get_metric_values( - metric=metric, with_index=True) + validation_values = validation_tracker.get_metric_values(metric=metric, with_index=True) validation_avg_values = aggregate_by_epoch(validation_values) validation_epochs = sorted(validation_avg_values.keys()) - validation_vals = [ - validation_avg_values[epoch] - for epoch - in validation_epochs - ] + validation_vals = [validation_avg_values[epoch] for epoch in validation_epochs] plt.plot( validation_epochs, validation_vals, - label=f'Validation {metric}', - 
marker='o', - linestyle='--', - color='orange' + label=f"Validation {metric}", + marker="o", + linestyle="--", + color="orange", ) - plt.title(f'{metric.capitalize()} over Epochs') - plt.xlabel('Epoch') - plt.ylabel(f'{metric.capitalize()} Value') + plt.title(f"{metric.capitalize()} over Epochs") + plt.xlabel("Epoch") + plt.ylabel(f"{metric.capitalize()} Value") plt.legend() plt.grid(True) - plt.savefig(f'{output_dir}/{metric}_plot.png') + plt.savefig(f"{output_dir}/{metric}_plot.png") plt.close() diff --git a/maestro/trainer/models/florence_2/entities.py b/maestro/trainer/models/florence_2/entities.py index e69de29..17f4019 100644 --- a/maestro/trainer/models/florence_2/entities.py +++ b/maestro/trainer/models/florence_2/entities.py @@ -0,0 +1,41 @@ +import os +from dataclasses import dataclass +from typing import Optional, Literal, Union + +import torch + +from maestro.trainer.common.configuration.env import CUDA_DEVICE_ENV, DEFAULT_CUDA_DEVICE + + +LoraInitLiteral = Literal["gaussian", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"] + + +DEFAULT_FLORENCE2_MODEL_ID = "microsoft/Florence-2-base-ft" +DEFAULT_FLORENCE2_MODEL_REVISION = "refs/pr/20" +DEVICE = os.getenv(CUDA_DEVICE_ENV, DEFAULT_CUDA_DEVICE) + + +@dataclass(frozen=True) +class TrainingConfiguration: + dataset_location: str + model_id_or_path: str = DEFAULT_FLORENCE2_MODEL_ID + revision: str = DEFAULT_FLORENCE2_MODEL_REVISION + device: torch.device = torch.device(DEVICE) + transformers_cache_dir: Optional[str] = None + training_epochs: int = 10 + optimiser: Literal["SGD", "adamw", "adam"] = "adamw" + learning_rate: float = 1e-5 + lr_scheduler: Literal["linear", "cosine", "polynomial"] = "linear" + train_batch_size: int = 4 + test_batch_size: Optional[int] = None + loaders_workers: int = 0 + test_loaders_workers: Optional[int] = None + lora_r: int = 8 + lora_alpha: int = 8 + lora_dropout: float = 0.05 + bias: Literal["none", "all", "lora_only"] = "none" + use_rslora: bool = True + init_lora_weights: Union[bool, LoraInitLiteral] = "gaussian" + training_dir: str = "./training/florence-2" + max_checkpoints_to_keep: int = 3 + num_samples_to_visualise: int = 64 diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py new file mode 100644 index 0000000..09c2d87 --- /dev/null +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -0,0 +1,152 @@ +import dataclasses +from typing import Optional, Annotated + +import torch +import typer +import rich + +from maestro.trainer.models.florence_2.entities import ( + TrainingConfiguration, + DEFAULT_FLORENCE2_MODEL_ID, + DEFAULT_FLORENCE2_MODEL_REVISION, + DEVICE, +) + +from maestro.trainer.models.florence_2.training import train as train_fun + +florence_2_app = typer.Typer(help="Fine-tune and evaluate Florence 2 model") + + +@florence_2_app.command( + help="Train Florence 2 model", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} +) +def train( + dataset_location: Annotated[ + str, + typer.Option("--dataset_location", help="Path to directory with dataset"), + ], + model_id_or_path: Annotated[ + str, + typer.Option("--model_id_or_path", help="Model to be used or path to your checkpoint"), + ] = DEFAULT_FLORENCE2_MODEL_ID, + revision: Annotated[ + str, + typer.Option("--revision", help="Revision of Florence2 HF repository"), + ] = DEFAULT_FLORENCE2_MODEL_REVISION, + device: Annotated[ + str, + typer.Option("--device", help="CUDA device ID to be used (in format: 'cuda:0')"), + ] = DEVICE, + 
transformers_cache_dir: Annotated[ + Optional[str], + typer.Option("--transformers_cache_dir", help="Cache dir for HF weights"), + ] = None, + training_epochs: Annotated[ + int, + typer.Option("--training_epochs", help="Number of training epochs"), + ] = 10, + optimiser: Annotated[ + str, + typer.Option("--optimiser", help="Optimiser to be used"), + ] = "adamw", + learning_rate: Annotated[ + float, + typer.Option("--learning_rate", help="Learning rate"), + ] = 1e-5, + lr_scheduler: Annotated[ + str, + typer.Option("--lr_scheduler", help="LR scheduler"), + ] = "linear", + train_batch_size: Annotated[ + int, + typer.Option("--train_batch_size", help="Batch size for training"), + ] = 4, + test_batch_size: Annotated[ + Optional[int], + typer.Option( + "--train_batch_size", help="Batch size for validation and test. If not given - train will be used." + ), + ] = None, + loaders_workers: Annotated[ + int, + typer.Option("--loaders_workers", help="Number of loaders workers. 0 = # of CPU"), + ] = 0, + test_loaders_workers: Annotated[ + Optional[int], + typer.Option( + "--test_loaders_workers", + help="Number of workers for test and val loaders. If not given - train will be used.", + ), + ] = None, + lora_r: Annotated[ + int, + typer.Option("--lora_r", help="Value of Lora R"), + ] = 8, + lora_alpha: Annotated[ + int, + typer.Option("--lora_alpha", help="Value of Lora Alpha"), + ] = 8, + lora_dropout: Annotated[ + float, + typer.Option("--lora_dropout", help="Value of Lora Dropout"), + ] = 0.05, + bias: Annotated[ + str, + typer.Option("--bias", help="Value of Lora Bias"), + ] = "none", + use_rslora: Annotated[ + bool, + typer.Option( + "--use_rslora/--no_use_rslora", + help="Boolean flag to decide if rslora to be used", + ), + ] = True, + init_lora_weights: Annotated[ + str, + typer.Option("--init_lora_weights", help="Lora weights initialisation"), + ] = "gaussian", + training_dir: Annotated[ + str, + typer.Option("--training_dir", help="Path to directory where training outputs should be preserved"), + ] = "./training/florence-2", + max_checkpoints_to_keep: Annotated[ + int, + typer.Option("--max_checkpoints_to_keep", help="Max checkpoints to keep"), + ] = 3, + num_samples_to_visualise: Annotated[ + int, + typer.Option("--num_samples_to_visualise", help="Number of samples to visualise"), + ] = 64, +) -> None: + configuration = TrainingConfiguration( + dataset_location=dataset_location, + model_id_or_path=model_id_or_path, + revision=revision, + device=torch.device(device), + transformers_cache_dir=transformers_cache_dir, + training_epochs=training_epochs, + optimiser=optimiser, # type: ignore + learning_rate=learning_rate, + lr_scheduler=lr_scheduler, # type: ignore + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + loaders_workers=loaders_workers, + test_loaders_workers=test_loaders_workers, + lora_r=lora_r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + bias=bias, # type: ignore + use_rslora=use_rslora, + init_lora_weights=init_lora_weights, # type: ignore + training_dir=training_dir, + max_checkpoints_to_keep=max_checkpoints_to_keep, + num_samples_to_visualise=num_samples_to_visualise, + ) + typer.echo(typer.style("Training configuration", fg=typer.colors.BRIGHT_GREEN, bold=True)) + rich.print(dataclasses.asdict(configuration)) + train_fun(configuration=configuration) + + +@florence_2_app.command(help="Evaluate Florence 2 model") +def evaluate() -> None: + pass diff --git a/maestro/trainer/models/florence_2/training.py 
b/maestro/trainer/models/florence_2/training.py index aaa1687..7e7ae56 100644 --- a/maestro/trainer/models/florence_2/training.py +++ b/maestro/trainer/models/florence_2/training.py @@ -1,6 +1,6 @@ import os import shutil -from dataclasses import replace, dataclass +from dataclasses import replace from glob import glob from typing import Optional, Tuple, List, Literal, Union @@ -12,48 +12,20 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoProcessor, get_scheduler -from maestro.trainer.common.configuration.env import CUDA_DEVICE_ENV, \ - DEFAULT_CUDA_DEVICE from maestro.trainer.common.utils.leaderboard import CheckpointsLeaderboard -from maestro.trainer.common.utils.metrics_tracing import MetricsTracker, \ - save_metric_plots +from maestro.trainer.common.utils.metrics_tracing import MetricsTracker, save_metric_plots from maestro.trainer.common.utils.reproducibility import make_it_reproducible from maestro.trainer.models.florence_2.data_loading import prepare_data_loaders +from maestro.trainer.models.florence_2.entities import ( + DEFAULT_FLORENCE2_MODEL_ID, + DEFAULT_FLORENCE2_MODEL_REVISION, + DEVICE, + TrainingConfiguration, +) from maestro.trainer.models.florence_2.metrics import prepare_detection_training_summary from maestro.trainer.models.paligemma.training import LoraInitLiteral -DEFAULT_FLORENCE2_MODEL_ID = "microsoft/Florence-2-base-ft" -DEFAULT_FLORENCE2_MODEL_REVISION = "refs/pr/20" -DEVICE = torch.device("cpu") if not torch.cuda.is_available() else os.getenv(CUDA_DEVICE_ENV, DEFAULT_CUDA_DEVICE) - - -@dataclass(frozen=True) -class TrainingConfiguration: - dataset_location: str - model_id_or_path: str = DEFAULT_FLORENCE2_MODEL_ID - revision: str = DEFAULT_FLORENCE2_MODEL_REVISION - device: torch.device = DEVICE - transformers_cache_dir: Optional[str] = None - training_epochs: int = 10 - optimiser: Literal["SGD", "adamw", "adam"] = "adamw" - learning_rate: float = 1e-5 - lr_scheduler: Literal["linear", "cosine", "polynomial"] = "linear" - train_batch_size: int = 4 - test_batch_size: Optional[int] = None - loaders_workers: int = 0 - test_loaders_workers: Optional[int] = None - lora_r: int = 8 - lora_alpha: int = 8 - lora_dropout: float = 0.05 - bias: Literal["none", "all", "lora_only"] = "none" - use_rslora: bool = True - init_lora_weights: Union[bool, LoraInitLiteral] = "gaussian" - training_dir: str = "./training/florence-2" - max_checkpoints_to_keep: int = 3 - num_samples_to_visualise: int = 64 - - def train(configuration: TrainingConfiguration) -> None: make_it_reproducible(avoid_non_deterministic_algorithms=False) training_run_dir = _establish_training_run_dir( @@ -128,11 +100,11 @@ def train(configuration: TrainingConfiguration) -> None: output_dir=os.path.join(configuration.training_dir, "metrics"), ) training_metrics_tracker.as_json( - output_dir=os.path.join(configuration.training_dir, "metrics"), - filename="training.json") + output_dir=os.path.join(configuration.training_dir, "metrics"), filename="training.json" + ) validation_metrics_tracker.as_json( - output_dir=os.path.join(configuration.training_dir, "metrics"), - filename="validation.json") + output_dir=os.path.join(configuration.training_dir, "metrics"), filename="validation.json" + ) for split_name in ["valid", "test"]: prepare_detection_training_summary( diff --git a/maestro/trainer/models/paligemma/entrypoint.py b/maestro/trainer/models/paligemma/entrypoint.py new file mode 100644 index 0000000..6ef6dc6 --- /dev/null +++ b/maestro/trainer/models/paligemma/entrypoint.py @@ -0,0 +1,13 
@@ +import typer + +paligemma_app = typer.Typer(help="Fine-tune and evaluate PaliGemma model") + + +@paligemma_app.command(help="Train PaliGemma model") +def train() -> None: + typer.echo("🚧 Just a placeholder - to be implemented 🚧") + + +@paligemma_app.command(help="Evaluate PaliGemma model") +def evaluate() -> None: + typer.echo("🚧 Just a placeholder - to be implemented 🚧") diff --git a/requirements/requirements.txt b/requirements/requirements.txt index a181a1a..e06e303 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -7,4 +7,5 @@ sentencepiece~=0.2.0 peft~=0.12.0 flash-attn~=2.6.3 # does not work on mac einops~=0.8.0 -timm~=1.0.9 \ No newline at end of file +timm~=1.0.9 +typer~=0.12.5 diff --git a/setup.py b/setup.py index 9e16b7b..293f4e9 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,11 @@ def read_requirements(path: Union[str, List[str]]) -> List[str]: "dev": read_requirements("requirements/requirements.test.txt"), "docs": read_requirements("requirements/requirements.docs.txt"), }, + entry_points={ + "console_scripts": [ + "maestro=maestro.cli.main:app", + ], + }, classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", From dad39bab6ff3335b8843f5fff65538e840ac866a Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 16:00:08 +0200 Subject: [PATCH 02/14] TrainingConfiguration filed names refactoer --- maestro/cli/introspection.py | 3 +- maestro/trainer/models/florence_2/training.py | 166 +++++++++++------- 2 files changed, 100 insertions(+), 69 deletions(-) diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py index 3a62ce3..fc6aef9 100644 --- a/maestro/cli/introspection.py +++ b/maestro/cli/introspection.py @@ -2,7 +2,8 @@ import typer -from maestro.cli.env import DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV +from maestro.cli.env import DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, \ + DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV from maestro.cli.utils import str2bool diff --git a/maestro/trainer/models/florence_2/training.py b/maestro/trainer/models/florence_2/training.py index 702546b..1885a9f 100644 --- a/maestro/trainer/models/florence_2/training.py +++ b/maestro/trainer/models/florence_2/training.py @@ -28,68 +28,95 @@ @dataclass(frozen=True) class TrainingConfiguration: - dataset_location: str - model_id_or_path: str = DEFAULT_FLORENCE2_MODEL_ID + """Configuration for training a Florence-2 model. + + This class encapsulates all the parameters needed for training a Florence-2 model, + including dataset paths, model specifications, training hyperparameters, and output settings. + + Attributes: + dataset_path (str): Path to the dataset used for training. + model_id (str): Identifier for the Florence-2 model. Defaults to DEFAULT_FLORENCE2_MODEL_ID. + revision (str): Revision of the model to use. Defaults to DEFAULT_FLORENCE2_MODEL_REVISION. + device (torch.device): Device to use for training. Defaults to DEVICE. + cache_dir (Optional[str]): Directory to cache the model. Defaults to None. + epochs (int): Number of training epochs. Defaults to 10. + optimizer (Literal["sgd", "adamw", "adam"]): Optimizer to use for training. Defaults to "adamw". + lr (float): Learning rate for the optimizer. Defaults to 1e-5. + lr_scheduler (Literal["linear", "cosine", "polynomial"]): Learning rate scheduler. Defaults to "linear". + batch_size (int): Batch size for training. Defaults to 4. + val_batch_size (Optional[int]): Batch size for validation. Defaults to None. 
+ num_workers (int): Number of workers for data loading. Defaults to 0. + val_num_workers (Optional[int]): Number of workers for validation data loading. Defaults to None. + lora_r (int): Rank of the LoRA update matrices. Defaults to 8. + lora_alpha (int): Scaling factor for the LoRA update. Defaults to 8. + lora_dropout (float): Dropout probability for LoRA layers. Defaults to 0.05. + bias (Literal["none", "all", "lora_only"]): Which bias to train. Defaults to "none". + use_rslora (bool): Whether to use RSLoRA. Defaults to True. + init_lora_weights (Union[bool, LoraInitLiteral]): How to initialize LoRA weights. Defaults to "gaussian". + output_dir (str): Directory to save output files. Defaults to "./training/florence-2". + metrics (List[BaseMetric]): List of metrics to track during training. Defaults to an empty list. + """ + dataset_path: str + model_id: str = DEFAULT_FLORENCE2_MODEL_ID revision: str = DEFAULT_FLORENCE2_MODEL_REVISION device: torch.device = DEVICE - transformers_cache_dir: Optional[str] = None - training_epochs: int = 10 - optimiser: Literal["SGD", "adamw", "adam"] = "adamw" - learning_rate: float = 1e-5 + cache_dir: Optional[str] = None + epochs: int = 10 + optimizer: Literal["sgd", "adamw", "adam"] = "adamw" + lr: float = 1e-5 lr_scheduler: Literal["linear", "cosine", "polynomial"] = "linear" - train_batch_size: int = 4 - test_batch_size: Optional[int] = None - loaders_workers: int = 0 - test_loaders_workers: Optional[int] = None + batch_size: int = 4 + val_batch_size: Optional[int] = None + num_workers: int = 0 + val_num_workers: Optional[int] = None lora_r: int = 8 lora_alpha: int = 8 lora_dropout: float = 0.05 bias: Literal["none", "all", "lora_only"] = "none" use_rslora: bool = True init_lora_weights: Union[bool, LoraInitLiteral] = "gaussian" - training_dir: str = "./training/florence-2" - num_samples_to_visualise: int = 64 + output_dir: str = "./training/florence-2" metrics: List[BaseMetric] = field(default_factory=list) -def train(configuration: TrainingConfiguration) -> None: +def train(config: TrainingConfiguration) -> None: make_it_reproducible(avoid_non_deterministic_algorithms=False) - training_run_dir = _establish_training_run_dir( - training_dir=configuration.training_dir, + run_dir = _establish_training_run_dir( + output_dir=config.output_dir, ) - configuration = replace( - configuration, - training_dir=training_run_dir, + config = replace( + config, + output_dir=run_dir, ) - checkpoint_manager = CheckpointManager(training_run_dir) + checkpoint_manager = CheckpointManager(run_dir) processor, model = load_model( - model_id_or_path=configuration.model_id_or_path, - revision=configuration.revision, - device=configuration.device, - cache_dir=configuration.transformers_cache_dir, + model_id_or_path=config.model_id, + revision=config.revision, + device=config.device, + cache_dir=config.cache_dir, ) train_loader, val_loader, test_loader = prepare_data_loaders( - dataset_location=configuration.dataset_location, - train_batch_size=configuration.train_batch_size, + dataset_location=config.dataset_path, + train_batch_size=config.batch_size, processor=processor, - device=configuration.device, - num_workers=configuration.loaders_workers, - test_loaders_workers=configuration.test_loaders_workers, + device=config.device, + num_workers=config.num_workers, + test_loaders_workers=config.val_num_workers, ) peft_model = prepare_peft_model( model=model, - r=configuration.lora_r, - lora_alpha=configuration.lora_alpha, - lora_dropout=configuration.lora_dropout, - 
bias=configuration.bias, - use_rslora=configuration.use_rslora, - init_lora_weights=configuration.init_lora_weights, - revision=configuration.revision, + r=config.lora_r, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias=config.bias, + use_rslora=config.use_rslora, + init_lora_weights=config.init_lora_weights, + revision=config.revision, ) training_metrics_tracker = MetricsTracker.init(metrics=["loss"]) metrics = ["loss"] - for metric in configuration.metrics: + for metric in config.metrics: metrics += metric.describe() validation_metrics_tracker = MetricsTracker.init(metrics=metrics) @@ -97,7 +124,7 @@ def train(configuration: TrainingConfiguration) -> None: processor=processor, model=peft_model, data_loaders=(train_loader, val_loader), - configuration=configuration, + config=config, training_metrics_tracker=training_metrics_tracker, validation_metrics_tracker=validation_metrics_tracker, checkpoint_manager=checkpoint_manager @@ -106,13 +133,13 @@ def train(configuration: TrainingConfiguration) -> None: save_metric_plots( training_tracker=training_metrics_tracker, validation_tracker=validation_metrics_tracker, - output_dir=os.path.join(configuration.training_dir, "metrics"), + output_dir=os.path.join(config.output_dir, "metrics"), ) training_metrics_tracker.as_json( - output_dir=os.path.join(configuration.training_dir, "metrics"), + output_dir=os.path.join(config.output_dir, "metrics"), filename="training.json") validation_metrics_tracker.as_json( - output_dir=os.path.join(configuration.training_dir, "metrics"), + output_dir=os.path.join(config.output_dir, "metrics"), filename="validation.json") @@ -148,28 +175,28 @@ def run_training_loop( processor: AutoProcessor, model: PeftModel, data_loaders: Tuple[DataLoader, Optional[DataLoader]], - configuration: TrainingConfiguration, + config: TrainingConfiguration, training_metrics_tracker: MetricsTracker, validation_metrics_tracker: MetricsTracker, checkpoint_manager: CheckpointManager, ) -> None: train_loader, val_loader = data_loaders - optimizer = _get_optimizer(model=model, configuration=configuration) - total_num_training_steps = configuration.training_epochs * len(train_loader) + optimizer = _get_optimizer(model=model, config=config) + total_steps = config.epochs * len(train_loader) lr_scheduler = get_scheduler( - name=configuration.lr_scheduler, + name=config.lr_scheduler, optimizer=optimizer, num_warmup_steps=0, - num_training_steps=total_num_training_steps, + num_training_steps=total_steps, ) - for epoch in range(configuration.training_epochs): + for epoch in range(config.epochs): run_training_epoch( processor=processor, model=model, train_loader=train_loader, val_loader=val_loader, epoch_number=epoch + 1, - configuration=configuration, + config=config, optimizer=optimizer, lr_scheduler=lr_scheduler, training_metrics_tracker=training_metrics_tracker, @@ -184,7 +211,7 @@ def run_training_epoch( train_loader: DataLoader, val_loader: Optional[DataLoader], epoch_number: int, - configuration: TrainingConfiguration, + config: TrainingConfiguration, optimizer: Optimizer, lr_scheduler: LRScheduler, training_metrics_tracker: MetricsTracker, @@ -193,13 +220,13 @@ def run_training_epoch( ) -> None: model.train() training_losses: List[float] = [] - training_iterator = tqdm(train_loader, desc=f"Epoch {epoch_number}/{configuration.training_epochs}") + training_iterator = tqdm(train_loader, desc=f"Epoch {epoch_number}/{config.epochs}") for step_id, (inputs, answers) in enumerate(training_iterator): input_ids = 
inputs["input_ids"] pixel_values = inputs["pixel_values"] labels = processor.tokenizer( text=answers, return_tensors="pt", padding=True, return_token_type_ids=False - ).input_ids.to(configuration.device) + ).input_ids.to(config.device) outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels) loss = outputs.loss loss.backward() @@ -217,7 +244,7 @@ def run_training_epoch( last_100_losses = training_losses[-100:] loss_moving_average = sum(last_100_losses) / len(last_100_losses) if len(last_100_losses) > 0 else 0.0 training_iterator.set_description( - f"Epoch {epoch_number}/{configuration.training_epochs}. Loss: {round(loss_moving_average, 4)}" + f"Epoch {epoch_number}/{config.epochs}. Loss: {round(loss_moving_average, 4)}" ) if len(training_losses) > 0: avg_train_loss = sum(training_losses) / len(training_losses) @@ -230,7 +257,7 @@ def run_training_epoch( model=model, loader=val_loader, epoch_number=epoch_number, - configuration=configuration, + config=config, metrics_tracker=validation_metrics_tracker, ) @@ -243,7 +270,7 @@ def run_validation_epoch( processor: AutoProcessor, model: Union[PeftModel, AutoModelForCausalLM], loader: DataLoader, - configuration: TrainingConfiguration, + config: TrainingConfiguration, metrics_tracker: MetricsTracker, epoch_number: int ) -> None: @@ -257,7 +284,7 @@ def run_validation_epoch( return_tensors="pt", padding=True, return_token_type_ids=False - ).input_ids.to(configuration.device) + ).input_ids.to(config.device) outputs = model( input_ids=input_ids, pixel_values=pixel_values, @@ -277,12 +304,12 @@ def run_validation_epoch( dataset=loader.dataset, processor=processor, model=model, - device=configuration.device, + device=config.device, ) metrics_results = {"loss": avg_val_loss} - for metric in configuration.metrics: + for metric in config.metrics: if isinstance(metric, MeanAveragePrecisionMetric): classes = extract_unique_detection_dataset_classes(loader.dataset) targets, predictions = postprocess_florence2_output_for_mean_average_precision( @@ -308,19 +335,22 @@ def run_validation_epoch( display_results(prompts, expected_responses, generated_texts, images) -def _establish_training_run_dir(training_dir: str) -> str: - training_dir = os.path.abspath(training_dir) - existing_directory_entries = glob(os.path.join(training_dir, "*")) +def _establish_training_run_dir(output_dir: str) -> str: + output_dir = os.path.abspath(output_dir) + existing_directory_entries = glob(os.path.join(output_dir, "*")) subdirectories = [path for path in existing_directory_entries if os.path.isdir(path)] run_id = len(subdirectories) + 1 - training_run_dir = os.path.join(training_dir, str(run_id)) - os.makedirs(training_run_dir, exist_ok=True) - return training_run_dir + run_dir = os.path.join(output_dir, str(run_id)) + os.makedirs(run_dir, exist_ok=True) + return run_dir -def _get_optimizer(model: PeftModel, configuration: TrainingConfiguration) -> Optimizer: - if configuration.optimiser == "adamw": - return AdamW(model.parameters(), lr=configuration.learning_rate) - if configuration.optimiser == "adam": - return Adam(model.parameters(), lr=configuration.learning_rate) - return SGD(model.parameters(), lr=configuration.learning_rate) +def _get_optimizer(model: PeftModel, config: TrainingConfiguration) -> Optimizer: + optimizer_type = config.optimizer.lower() + if optimizer_type == "adamw": + return AdamW(model.parameters(), lr=config.lr) + if optimizer_type == "adam": + return Adam(model.parameters(), lr=config.lr) + if optimizer_type == "sgd": + return 
SGD(model.parameters(), lr=config.lr) + raise ValueError(f"Unsupported optimizer: {config.optimizer}") From 672f27e4397112043cfefab40d3d6eef82e9a117 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 16:52:48 +0200 Subject: [PATCH 03/14] final tests before plugging in CLI --- maestro/trainer/common/utils/file_system.py | 19 +++ maestro/trainer/models/florence_2/__init__.py | 2 + .../florence_2/{training.py => core.py} | 93 +++++----- .../trainer/models/florence_2/entrypoint.py | 14 +- .../trainer/models/florence_2/inference.py | 159 ------------------ maestro/trainer/models/florence_2/metrics.py | 3 +- 6 files changed, 73 insertions(+), 217 deletions(-) rename maestro/trainer/models/florence_2/{training.py => core.py} (85%) delete mode 100644 maestro/trainer/models/florence_2/inference.py diff --git a/maestro/trainer/common/utils/file_system.py b/maestro/trainer/common/utils/file_system.py index e258f25..da8e5a9 100644 --- a/maestro/trainer/common/utils/file_system.py +++ b/maestro/trainer/common/utils/file_system.py @@ -1,5 +1,6 @@ import json import os +from glob import glob from typing import Union, List @@ -38,3 +39,21 @@ def save_json(path: str, content: dict) -> None: def ensure_parent_dir_exists(path: str) -> None: parent_dir = os.path.dirname(os.path.abspath(path)) os.makedirs(parent_dir, exist_ok=True) + + +def create_new_run_directory(base_output_dir: str) -> str: + """ + Creates a new numbered directory for the current training run. + + Args: + base_output_dir (str): The base directory where all run directories are stored. + + Returns: + str: The path to the newly created run directory. + """ + base_output_dir = os.path.abspath(base_output_dir) + existing_run_dirs = [d for d in glob(os.path.join(base_output_dir, "*")) if os.path.isdir(d)] + new_run_number = len(existing_run_dirs) + 1 + new_run_dir = os.path.join(base_output_dir, str(new_run_number)) + os.makedirs(new_run_dir, exist_ok=True) + return new_run_dir diff --git a/maestro/trainer/models/florence_2/__init__.py b/maestro/trainer/models/florence_2/__init__.py index e69de29..836df5c 100644 --- a/maestro/trainer/models/florence_2/__init__.py +++ b/maestro/trainer/models/florence_2/__init__.py @@ -0,0 +1,2 @@ +from maestro.trainer.models.florence_2.core import TrainingConfiguration, train +from maestro.trainer.models.florence_2.metrics import MeanAveragePrecisionMetric diff --git a/maestro/trainer/models/florence_2/training.py b/maestro/trainer/models/florence_2/core.py similarity index 85% rename from maestro/trainer/models/florence_2/training.py rename to maestro/trainer/models/florence_2/core.py index 1885a9f..46ea75b 100644 --- a/maestro/trainer/models/florence_2/training.py +++ b/maestro/trainer/models/florence_2/core.py @@ -1,6 +1,5 @@ import os from dataclasses import dataclass, field, replace -from glob import glob from typing import List, Literal, Optional, Tuple, Union import torch @@ -11,6 +10,7 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoProcessor, get_scheduler +from maestro.trainer.common.utils.file_system import create_new_run_directory from maestro.trainer.common.utils.metrics import BaseMetric, MetricsTracker, \ display_results, save_metric_plots from maestro.trainer.common.utils.reproducibility import make_it_reproducible @@ -81,8 +81,8 @@ class TrainingConfiguration: def train(config: TrainingConfiguration) -> None: make_it_reproducible(avoid_non_deterministic_algorithms=False) - run_dir = _establish_training_run_dir( - output_dir=config.output_dir, + run_dir = 
create_new_run_directory( + base_output_dir=config.output_dir, ) config = replace( config, @@ -181,7 +181,7 @@ def run_training_loop( checkpoint_manager: CheckpointManager, ) -> None: train_loader, val_loader = data_loaders - optimizer = _get_optimizer(model=model, config=config) + optimizer = get_optimizer(model=model, config=config) total_steps = config.epochs * len(train_loader) lr_scheduler = get_scheduler( name=config.lr_scheduler, @@ -195,7 +195,7 @@ def run_training_loop( model=model, train_loader=train_loader, val_loader=val_loader, - epoch_number=epoch + 1, + epoch=epoch + 1, config=config, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -210,7 +210,7 @@ def run_training_epoch( model: PeftModel, train_loader: DataLoader, val_loader: Optional[DataLoader], - epoch_number: int, + epoch: int, config: TrainingConfiguration, optimizer: Optimizer, lr_scheduler: LRScheduler, @@ -220,43 +220,50 @@ def run_training_epoch( ) -> None: model.train() training_losses: List[float] = [] - training_iterator = tqdm(train_loader, desc=f"Epoch {epoch_number}/{config.epochs}") - for step_id, (inputs, answers) in enumerate(training_iterator): - input_ids = inputs["input_ids"] - pixel_values = inputs["pixel_values"] - labels = processor.tokenizer( - text=answers, return_tensors="pt", padding=True, return_token_type_ids=False - ).input_ids.to(config.device) - outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels) - loss = outputs.loss - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - loss = loss.item() - training_metrics_tracker.register( - metric="loss", - epoch=epoch_number, - step=step_id + 1, - value=loss, - ) - training_losses.append(loss) - last_100_losses = training_losses[-100:] - loss_moving_average = sum(last_100_losses) / len(last_100_losses) if len(last_100_losses) > 0 else 0.0 - training_iterator.set_description( - f"Epoch {epoch_number}/{config.epochs}. 
Loss: {round(loss_moving_average, 4)}" - ) - if len(training_losses) > 0: - avg_train_loss = sum(training_losses) / len(training_losses) - print(f"Average Training Loss: {avg_train_loss}") + + with tqdm(total=len(train_loader), desc=f"Epoch {epoch}/{config.epochs}", unit="batch") as pbar: + for step_id, (inputs, answers) in enumerate(train_loader): + input_ids = inputs["input_ids"] + pixel_values = inputs["pixel_values"] + labels = processor.tokenizer( + text=answers, + return_tensors="pt", + padding=True, + return_token_type_ids=False + ).input_ids.to(config.device) + outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels) + loss = outputs.loss + loss.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + loss = loss.item() + training_metrics_tracker.register( + metric="loss", + epoch=epoch, + step=step_id + 1, + value=loss, + ) + training_losses.append(loss) + + # Update progress bar + last_100_losses = training_losses[-100:] + loss_moving_average = sum(last_100_losses) / len(last_100_losses) if last_100_losses else 0.0 + pbar.set_postfix({"Loss": f"{loss_moving_average:.4f}"}) + pbar.update(1) + + # Save checkpoints based on training loss if no validation loader if val_loader is None or len(val_loader) == 0: - return None + train_loss = sum(training_losses) / len(training_losses) + checkpoint_manager.save_latest(processor, model) + checkpoint_manager.save_best(processor, model, train_loss) + return run_validation_epoch( processor=processor, model=model, loader=val_loader, - epoch_number=epoch_number, + epoch_number=epoch, config=config, metrics_tracker=validation_metrics_tracker, ) @@ -335,17 +342,7 @@ def run_validation_epoch( display_results(prompts, expected_responses, generated_texts, images) -def _establish_training_run_dir(output_dir: str) -> str: - output_dir = os.path.abspath(output_dir) - existing_directory_entries = glob(os.path.join(output_dir, "*")) - subdirectories = [path for path in existing_directory_entries if os.path.isdir(path)] - run_id = len(subdirectories) + 1 - run_dir = os.path.join(output_dir, str(run_id)) - os.makedirs(run_dir, exist_ok=True) - return run_dir - - -def _get_optimizer(model: PeftModel, config: TrainingConfiguration) -> Optimizer: +def get_optimizer(model: PeftModel, config: TrainingConfiguration) -> Optimizer: optimizer_type = config.optimizer.lower() if optimizer_type == "adamw": return AdamW(model.parameters(), lr=config.lr) diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 09c2d87..fd7f7b7 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -1,18 +1,14 @@ import dataclasses from typing import Optional, Annotated +import rich import torch import typer -import rich - -from maestro.trainer.models.florence_2.entities import ( - TrainingConfiguration, - DEFAULT_FLORENCE2_MODEL_ID, - DEFAULT_FLORENCE2_MODEL_REVISION, - DEVICE, -) -from maestro.trainer.models.florence_2.training import train as train_fun +from maestro.trainer.models.florence_2.checkpoints import DEFAULT_FLORENCE2_MODEL_ID, \ + DEFAULT_FLORENCE2_MODEL_REVISION, DEVICE +from maestro.trainer.models.florence_2.core import TrainingConfiguration +from maestro.trainer.models.florence_2.core import train as train_fun florence_2_app = typer.Typer(help="Fine-tune and evaluate Florence 2 model") diff --git a/maestro/trainer/models/florence_2/inference.py b/maestro/trainer/models/florence_2/inference.py deleted file 
mode 100644 index 4f0c154..0000000 --- a/maestro/trainer/models/florence_2/inference.py +++ /dev/null @@ -1,159 +0,0 @@ -from typing import Literal, Optional, Union - -import numpy as np -import supervision as sv - -from PIL import Image -from transformers import AutoProcessor, AutoModelForCausalLM - - -def caption_image( - image: Image.Image, - processor: AutoProcessor, - model: AutoModelForCausalLM, - task: Literal["", "", "", ""], - prompt: Optional[str] = None, - max_new_tokens: int = 1024, - do_sample: bool = False, - num_beams: int = 3, -) -> str: - prompt = _pre_process_prompt(image=image, task=task, prompt=prompt) - model_device = model.device - inputs = processor(text=prompt, images=image, return_tensors="pt").to(model_device) - generated_ids = model.generate( - input_ids=inputs["input_ids"], - pixel_values=inputs["pixel_values"], - max_new_tokens=max_new_tokens, - do_sample=do_sample, - num_beams=num_beams, - ) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] - response = processor.post_process_generation(generated_text, task=task, image_size=(image.width, image.height)) - return response[task] - - -TASKS_THAT_REQUIRE_PROMPT = { - "", - "", - "", - "", - "", - "", -} - - -def segment_objects( - image: Image.Image, - processor: AutoProcessor, - model: AutoModelForCausalLM, - task: Literal[ - "", - "", - ], - prompt: Optional[Union[str, tuple, list, np.ndarray]] = None, - max_new_tokens: int = 1024, - do_sample: bool = False, - num_beams: int = 3, -) -> sv.Detections: - return _prompt_and_retrieve_detections( - image=image, - processor=processor, - model=model, - task=task, - prompt=prompt, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - num_beams=num_beams, - ) - - -def detect_objects( - image: Image.Image, - processor: AutoProcessor, - model: AutoModelForCausalLM, - task: Literal[ - "", - "", - "", - "", - "", - "", - "", - ], - prompt: Optional[Union[str, tuple, list, np.ndarray]] = None, - max_new_tokens: int = 1024, - do_sample: bool = False, - num_beams: int = 3, -) -> sv.Detections: - return _prompt_and_retrieve_detections( - image=image, - processor=processor, - model=model, - task=task, - prompt=prompt, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - num_beams=num_beams, - ) - - -def _prompt_and_retrieve_detections( - image: Image.Image, - processor: AutoProcessor, - model: AutoModelForCausalLM, - task: Literal[ - "", - "", - "", - "", - "", - "", - "", - "", - "", - ], - prompt: Optional[Union[str, tuple, list, np.ndarray]] = None, - max_new_tokens: int = 1024, - do_sample: bool = False, - num_beams: int = 3, -) -> sv.Detections: - prompt = _pre_process_prompt(image=image, task=task, prompt=prompt) - model_device = model.device - inputs = processor(text=prompt, images=image, return_tensors="pt").to(model_device) - generated_ids = model.generate( - input_ids=inputs["input_ids"], - pixel_values=inputs["pixel_values"], - max_new_tokens=max_new_tokens, - do_sample=do_sample, - num_beams=num_beams, - ) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] - response = processor.post_process_generation( - generated_text, - task=task, - image_size=(image.width, image.height), - ) - return sv.Detections.from_lmm( - lmm=sv.LMM.FLORENCE_2, - result=response, - resolution_wh=image.size, - ) - - -def _pre_process_prompt( - image: Image.Image, - task: str, - prompt: Optional[Union[str, tuple, list, np.ndarray]] = None, -) -> str: - if prompt is None: - if task in TASKS_THAT_REQUIRE_PROMPT: - 
raise ValueError(f"Task {task} requires prompt") - return task - if isinstance(prompt, tuple) or isinstance(prompt, list) or isinstance(prompt, np.ndarray): - if len(prompt) != 4: - raise ValueError("Expected sequence of 4 elements describing (x_min, y_min, x_max, y_max)") - x_min, y_min, x_max, y_max = prompt - x_min, x_max = round((x_min / image.width) * 1000), round((x_max / image.width) * 1000) - y_min, y_max = round((y_min / image.height) * 1000), round((y_max / image.height) * 1000) - return f"{task} " - return f"{task} {prompt}" diff --git a/maestro/trainer/models/florence_2/metrics.py b/maestro/trainer/models/florence_2/metrics.py index ae01b38..117e031 100644 --- a/maestro/trainer/models/florence_2/metrics.py +++ b/maestro/trainer/models/florence_2/metrics.py @@ -50,7 +50,8 @@ def postprocess_florence2_output_for_mean_average_precision( prediction = sv.Detections.from_lmm(sv.LMM.FLORENCE_2, prediction, resolution_wh=image.size) prediction = prediction[np.isin(prediction["class_name"], classes)] prediction.class_id = np.array([classes.index(class_name) for class_name in prediction["class_name"]]) - prediction.confidence = np.ones(len(prediction)) # Set confidence for mean average precision calculation + # Set confidence for mean average precision calculation + prediction.confidence = np.ones(len(prediction)) # Postprocess target for mean average precision calculation target = processor.post_process_generation(suffix, task="", image_size=image.size) From 4a339a43c6b8c8bae89ecefdecc903981faed31a Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 17:49:40 +0200 Subject: [PATCH 04/14] initial tests of CLI mode --- maestro/cli/__init__.py | 5 + maestro/cli/env.py | 4 +- maestro/cli/introspection.py | 56 +-- maestro/cli/main.py | 18 +- maestro/cli/utils.py | 4 +- .../trainer/models/florence_2/entrypoint.py | 346 ++++++++++-------- 6 files changed, 247 insertions(+), 186 deletions(-) diff --git a/maestro/cli/__init__.py b/maestro/cli/__init__.py index e69de29..d627b18 100644 --- a/maestro/cli/__init__.py +++ b/maestro/cli/__init__.py @@ -0,0 +1,5 @@ +import typer +from maestro.trainer.models.florence_2.entrypoint import app as florence2_app + +app = typer.Typer() +app.add_typer(florence2_app, name="florence2") diff --git a/maestro/cli/env.py b/maestro/cli/env.py index b95525e..5322a5d 100644 --- a/maestro/cli/env.py +++ b/maestro/cli/env.py @@ -1,2 +1,2 @@ -DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "DISABLE_RECIPE_IMPORTS_WARNINGS" -DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "False" +# DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "DISABLE_RECIPE_IMPORTS_WARNINGS" +# DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "False" diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py index fc6aef9..6223144 100644 --- a/maestro/cli/introspection.py +++ b/maestro/cli/introspection.py @@ -1,37 +1,37 @@ -import os +# import os -import typer +# import typer -from maestro.cli.env import DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, \ - DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV -from maestro.cli.utils import str2bool +# from maestro.cli.env import DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, \ +# DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV +# from maestro.cli.utils import str2bool -def find_training_recipes(app: typer.Typer) -> None: - try: - from maestro.trainer.models.florence_2.entrypoint import florence_2_app +# def find_training_recipes(app: typer.Typer) -> None: +# try: +# from maestro.trainer.models.florence_2.entrypoint import florence_2_app - app.add_typer(florence_2_app, name="florence2") 
- except Exception: - _warn_about_recipe_import_error(model_name="Florence 2") +# app.add_typer(florence_2_app, name="florence2") +# except Exception: +# _warn_about_recipe_import_error(model_name="Florence 2") - try: - from maestro.trainer.models.paligemma.entrypoint import paligemma_app +# try: +# from maestro.trainer.models.paligemma.entrypoint import paligemma_app - app.add_typer(paligemma_app, name="paligemma") - except Exception: - _warn_about_recipe_import_error(model_name="PaliGemma") +# app.add_typer(paligemma_app, name="paligemma") +# except Exception: +# _warn_about_recipe_import_error(model_name="PaliGemma") -def _warn_about_recipe_import_error(model_name: str) -> None: - disable_warnings = str2bool( - os.getenv( - DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, - DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, - ) - ) - if disable_warnings: - return None - warning = typer.style("WARNING", fg=typer.colors.RED, bold=True) - message = "🚧 " + warning + f" cannot import recipe for {model_name}" - typer.echo(message) +# def _warn_about_recipe_import_error(model_name: str) -> None: +# disable_warnings = str2bool( +# os.getenv( +# DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, +# DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, +# ) +# ) +# if disable_warnings: +# return None +# warning = typer.style("WARNING", fg=typer.colors.RED, bold=True) +# message = "🚧 " + warning + f" cannot import recipe for {model_name}" +# typer.echo(message) diff --git a/maestro/cli/main.py b/maestro/cli/main.py index b600e3a..9212e72 100644 --- a/maestro/cli/main.py +++ b/maestro/cli/main.py @@ -1,15 +1,15 @@ -import typer +# import typer -from maestro.cli.introspection import find_training_recipes +# from maestro.cli.introspection import find_training_recipes -app = typer.Typer() -find_training_recipes(app=app) +# app = typer.Typer() +# find_training_recipes(app=app) -@app.command(help="Display information about maestro") -def info(): - typer.echo("Welcome to maestro CLI. Let's train some VLM! 🏋") +# @app.command(help="Display information about maestro") +# def info(): +# typer.echo("Welcome to maestro CLI. Let's train some VLM! 
🏋") -if __name__ == "__main__": - app() +# if __name__ == "__main__": +# app() diff --git a/maestro/cli/utils.py b/maestro/cli/utils.py index 0751fef..6d31bd9 100644 --- a/maestro/cli/utils.py +++ b/maestro/cli/utils.py @@ -1,2 +1,2 @@ -def str2bool(value: str) -> bool: - return value.lower() in {"y", "t", "yes", "true"} +# def str2bool(value: str) -> bool: +# return value.lower() in {"y", "t", "yes", "true"} diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index fd7f7b7..80c931f 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -1,148 +1,204 @@ -import dataclasses -from typing import Optional, Annotated +# import dataclasses +# from typing import Optional, Annotated + +# import rich +# import torch +# import typer + +# from maestro.trainer.models.florence_2.checkpoints import DEFAULT_FLORENCE2_MODEL_ID, \ +# DEFAULT_FLORENCE2_MODEL_REVISION, DEVICE +# from maestro.trainer.models.florence_2.core import TrainingConfiguration +# from maestro.trainer.models.florence_2.core import train as train_fun + +# florence_2_app = typer.Typer(help="Fine-tune and evaluate Florence 2 model") + + +# @florence_2_app.command( +# help="Train Florence 2 model", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} +# ) +# def train( +# dataset_location: Annotated[ +# str, +# typer.Option("--dataset_location", help="Path to directory with dataset"), +# ], +# model_id_or_path: Annotated[ +# str, +# typer.Option("--model_id_or_path", help="Model to be used or path to your checkpoint"), +# ] = DEFAULT_FLORENCE2_MODEL_ID, +# revision: Annotated[ +# str, +# typer.Option("--revision", help="Revision of Florence2 HF repository"), +# ] = DEFAULT_FLORENCE2_MODEL_REVISION, +# device: Annotated[ +# str, +# typer.Option("--device", help="CUDA device ID to be used (in format: 'cuda:0')"), +# ] = DEVICE, +# transformers_cache_dir: Annotated[ +# Optional[str], +# typer.Option("--transformers_cache_dir", help="Cache dir for HF weights"), +# ] = None, +# training_epochs: Annotated[ +# int, +# typer.Option("--training_epochs", help="Number of training epochs"), +# ] = 10, +# optimiser: Annotated[ +# str, +# typer.Option("--optimiser", help="Optimiser to be used"), +# ] = "adamw", +# learning_rate: Annotated[ +# float, +# typer.Option("--learning_rate", help="Learning rate"), +# ] = 1e-5, +# lr_scheduler: Annotated[ +# str, +# typer.Option("--lr_scheduler", help="LR scheduler"), +# ] = "linear", +# train_batch_size: Annotated[ +# int, +# typer.Option("--train_batch_size", help="Batch size for training"), +# ] = 4, +# test_batch_size: Annotated[ +# Optional[int], +# typer.Option( +# "--train_batch_size", help="Batch size for validation and test. If not given - train will be used." +# ), +# ] = None, +# loaders_workers: Annotated[ +# int, +# typer.Option("--loaders_workers", help="Number of loaders workers. 0 = # of CPU"), +# ] = 0, +# test_loaders_workers: Annotated[ +# Optional[int], +# typer.Option( +# "--test_loaders_workers", +# help="Number of workers for test and val loaders. 
If not given - train will be used.", +# ), +# ] = None, +# lora_r: Annotated[ +# int, +# typer.Option("--lora_r", help="Value of Lora R"), +# ] = 8, +# lora_alpha: Annotated[ +# int, +# typer.Option("--lora_alpha", help="Value of Lora Alpha"), +# ] = 8, +# lora_dropout: Annotated[ +# float, +# typer.Option("--lora_dropout", help="Value of Lora Dropout"), +# ] = 0.05, +# bias: Annotated[ +# str, +# typer.Option("--bias", help="Value of Lora Bias"), +# ] = "none", +# use_rslora: Annotated[ +# bool, +# typer.Option( +# "--use_rslora/--no_use_rslora", +# help="Boolean flag to decide if rslora to be used", +# ), +# ] = True, +# init_lora_weights: Annotated[ +# str, +# typer.Option("--init_lora_weights", help="Lora weights initialisation"), +# ] = "gaussian", +# training_dir: Annotated[ +# str, +# typer.Option("--training_dir", help="Path to directory where training outputs should be preserved"), +# ] = "./training/florence-2", +# max_checkpoints_to_keep: Annotated[ +# int, +# typer.Option("--max_checkpoints_to_keep", help="Max checkpoints to keep"), +# ] = 3, +# num_samples_to_visualise: Annotated[ +# int, +# typer.Option("--num_samples_to_visualise", help="Number of samples to visualise"), +# ] = 64, +# ) -> None: +# configuration = TrainingConfiguration( +# dataset_location=dataset_location, +# model_id_or_path=model_id_or_path, +# revision=revision, +# device=torch.device(device), +# transformers_cache_dir=transformers_cache_dir, +# training_epochs=training_epochs, +# optimiser=optimiser, # type: ignore +# learning_rate=learning_rate, +# lr_scheduler=lr_scheduler, # type: ignore +# train_batch_size=train_batch_size, +# test_batch_size=test_batch_size, +# loaders_workers=loaders_workers, +# test_loaders_workers=test_loaders_workers, +# lora_r=lora_r, +# lora_alpha=lora_alpha, +# lora_dropout=lora_dropout, +# bias=bias, # type: ignore +# use_rslora=use_rslora, +# init_lora_weights=init_lora_weights, # type: ignore +# training_dir=training_dir, +# max_checkpoints_to_keep=max_checkpoints_to_keep, +# num_samples_to_visualise=num_samples_to_visualise, +# ) +# typer.echo(typer.style("Training configuration", fg=typer.colors.BRIGHT_GREEN, bold=True)) +# rich.print(dataclasses.asdict(configuration)) +# train_fun(configuration=configuration) + + +# @florence_2_app.command(help="Evaluate Florence 2 model") +# def evaluate() -> None: +# pass -import rich -import torch import typer +from typing import get_type_hints, Optional, Union, Literal +from maestro.trainer.models.florence_2.core import TrainingConfiguration, train + +app = typer.Typer() + + +def create_dynamic_cli_options(config_class): + hints = get_type_hints(config_class) + options = {} + + for field_name, field_type in hints.items(): + if field_name == 'metrics': # Skip complex types like metrics + continue + + if field_type == bool: + options[field_name] = typer.Option(None, help=f"{field_name} parameter") + elif field_type in (int, float, str): + options[field_name] = typer.Option(None, help=f"{field_name} parameter") + elif getattr(field_type, "__origin__", None) == Union: + if type(None) in field_type.__args__: + options[field_name] = typer.Option(None, help=f"{field_name} parameter") + elif getattr(field_type, "__origin__", None) == Literal: + options[field_name] = typer.Option(None, help=f"{field_name} parameter") + + return options + + +dynamic_options = create_dynamic_cli_options(TrainingConfiguration) + + +@app.command() +def florence2( + mode: str = typer.Option(..., help="Mode: 'train' or 'eval'"), + **dynamic_options +): + """Train 
or evaluate a Florence-2 model.""" + + # Filter out None values + config_overrides = {k: v for k, v in dynamic_options.items() if v is not None} + + # Create configuration with overrides + config = TrainingConfiguration(**config_overrides) + + if mode == "train": + train(config) + elif mode == "eval": + typer.echo("Evaluation not implemented yet.") + else: + typer.echo(f"Invalid mode: {mode}. Use 'train' or 'eval'.") + raise typer.Exit(code=1) + -from maestro.trainer.models.florence_2.checkpoints import DEFAULT_FLORENCE2_MODEL_ID, \ - DEFAULT_FLORENCE2_MODEL_REVISION, DEVICE -from maestro.trainer.models.florence_2.core import TrainingConfiguration -from maestro.trainer.models.florence_2.core import train as train_fun - -florence_2_app = typer.Typer(help="Fine-tune and evaluate Florence 2 model") - - -@florence_2_app.command( - help="Train Florence 2 model", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} -) -def train( - dataset_location: Annotated[ - str, - typer.Option("--dataset_location", help="Path to directory with dataset"), - ], - model_id_or_path: Annotated[ - str, - typer.Option("--model_id_or_path", help="Model to be used or path to your checkpoint"), - ] = DEFAULT_FLORENCE2_MODEL_ID, - revision: Annotated[ - str, - typer.Option("--revision", help="Revision of Florence2 HF repository"), - ] = DEFAULT_FLORENCE2_MODEL_REVISION, - device: Annotated[ - str, - typer.Option("--device", help="CUDA device ID to be used (in format: 'cuda:0')"), - ] = DEVICE, - transformers_cache_dir: Annotated[ - Optional[str], - typer.Option("--transformers_cache_dir", help="Cache dir for HF weights"), - ] = None, - training_epochs: Annotated[ - int, - typer.Option("--training_epochs", help="Number of training epochs"), - ] = 10, - optimiser: Annotated[ - str, - typer.Option("--optimiser", help="Optimiser to be used"), - ] = "adamw", - learning_rate: Annotated[ - float, - typer.Option("--learning_rate", help="Learning rate"), - ] = 1e-5, - lr_scheduler: Annotated[ - str, - typer.Option("--lr_scheduler", help="LR scheduler"), - ] = "linear", - train_batch_size: Annotated[ - int, - typer.Option("--train_batch_size", help="Batch size for training"), - ] = 4, - test_batch_size: Annotated[ - Optional[int], - typer.Option( - "--train_batch_size", help="Batch size for validation and test. If not given - train will be used." - ), - ] = None, - loaders_workers: Annotated[ - int, - typer.Option("--loaders_workers", help="Number of loaders workers. 0 = # of CPU"), - ] = 0, - test_loaders_workers: Annotated[ - Optional[int], - typer.Option( - "--test_loaders_workers", - help="Number of workers for test and val loaders. 
If not given - train will be used.", - ), - ] = None, - lora_r: Annotated[ - int, - typer.Option("--lora_r", help="Value of Lora R"), - ] = 8, - lora_alpha: Annotated[ - int, - typer.Option("--lora_alpha", help="Value of Lora Alpha"), - ] = 8, - lora_dropout: Annotated[ - float, - typer.Option("--lora_dropout", help="Value of Lora Dropout"), - ] = 0.05, - bias: Annotated[ - str, - typer.Option("--bias", help="Value of Lora Bias"), - ] = "none", - use_rslora: Annotated[ - bool, - typer.Option( - "--use_rslora/--no_use_rslora", - help="Boolean flag to decide if rslora to be used", - ), - ] = True, - init_lora_weights: Annotated[ - str, - typer.Option("--init_lora_weights", help="Lora weights initialisation"), - ] = "gaussian", - training_dir: Annotated[ - str, - typer.Option("--training_dir", help="Path to directory where training outputs should be preserved"), - ] = "./training/florence-2", - max_checkpoints_to_keep: Annotated[ - int, - typer.Option("--max_checkpoints_to_keep", help="Max checkpoints to keep"), - ] = 3, - num_samples_to_visualise: Annotated[ - int, - typer.Option("--num_samples_to_visualise", help="Number of samples to visualise"), - ] = 64, -) -> None: - configuration = TrainingConfiguration( - dataset_location=dataset_location, - model_id_or_path=model_id_or_path, - revision=revision, - device=torch.device(device), - transformers_cache_dir=transformers_cache_dir, - training_epochs=training_epochs, - optimiser=optimiser, # type: ignore - learning_rate=learning_rate, - lr_scheduler=lr_scheduler, # type: ignore - train_batch_size=train_batch_size, - test_batch_size=test_batch_size, - loaders_workers=loaders_workers, - test_loaders_workers=test_loaders_workers, - lora_r=lora_r, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - bias=bias, # type: ignore - use_rslora=use_rslora, - init_lora_weights=init_lora_weights, # type: ignore - training_dir=training_dir, - max_checkpoints_to_keep=max_checkpoints_to_keep, - num_samples_to_visualise=num_samples_to_visualise, - ) - typer.echo(typer.style("Training configuration", fg=typer.colors.BRIGHT_GREEN, bold=True)) - rich.print(dataclasses.asdict(configuration)) - train_fun(configuration=configuration) - - -@florence_2_app.command(help="Evaluate Florence 2 model") -def evaluate() -> None: - pass +if __name__ == "__main__": + app() From c7c63b7ffb92557132c032598ea3b81cf6075d82 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 18:30:16 +0200 Subject: [PATCH 05/14] fix --- maestro/cli/__init__.py | 4 ---- maestro/cli/main.py | 9 +++++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/maestro/cli/__init__.py b/maestro/cli/__init__.py index d627b18..8b13789 100644 --- a/maestro/cli/__init__.py +++ b/maestro/cli/__init__.py @@ -1,5 +1 @@ -import typer -from maestro.trainer.models.florence_2.entrypoint import app as florence2_app -app = typer.Typer() -app.add_typer(florence2_app, name="florence2") diff --git a/maestro/cli/main.py b/maestro/cli/main.py index 9212e72..5481b24 100644 --- a/maestro/cli/main.py +++ b/maestro/cli/main.py @@ -13,3 +13,12 @@ # if __name__ == "__main__": # app() + +import typer +from maestro.trainer.models.florence_2.entrypoint import app as florence2_app + +app = typer.Typer() +app.add_typer(florence2_app, name="florence2") + +if __name__ == "__main__": + app() From 5cc422014ea9746bde67b1d1434dcc4c3ffdd5fe Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 19:15:13 +0200 Subject: [PATCH 06/14] fix `No such option: --mode ` --- .../trainer/models/florence_2/entrypoint.py | 
26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 80c931f..284ebb6 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -148,8 +148,8 @@ # pass import typer -from typing import get_type_hints, Optional, Union, Literal -from maestro.trainer.models.florence_2.core import TrainingConfiguration, train +from typing import get_type_hints, Union, Literal +from maestro.trainer.models.florence_2.core import TrainingConfiguration, train as train_florence2 app = typer.Typer() @@ -179,25 +179,21 @@ def create_dynamic_cli_options(config_class): @app.command() -def florence2( - mode: str = typer.Option(..., help="Mode: 'train' or 'eval'"), - **dynamic_options -): - """Train or evaluate a Florence-2 model.""" - +def train(**dynamic_options): + """Train a Florence-2 model.""" # Filter out None values config_overrides = {k: v for k, v in dynamic_options.items() if v is not None} # Create configuration with overrides config = TrainingConfiguration(**config_overrides) - if mode == "train": - train(config) - elif mode == "eval": - typer.echo("Evaluation not implemented yet.") - else: - typer.echo(f"Invalid mode: {mode}. Use 'train' or 'eval'.") - raise typer.Exit(code=1) + train_florence2(config) + + +@app.command() +def evaluate(**dynamic_options): + """Evaluate a Florence-2 model.""" + typer.echo("Evaluation not implemented yet.") if __name__ == "__main__": From 518323c2a6731a665ac83ba60491d09ec54bb7d2 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 19:21:03 +0200 Subject: [PATCH 07/14] fix 2 `No such option: --mode ` --- .../trainer/models/florence_2/entrypoint.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 284ebb6..da811e6 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -153,7 +153,6 @@ app = typer.Typer() - def create_dynamic_cli_options(config_class): hints = get_type_hints(config_class) options = {} @@ -174,11 +173,19 @@ def create_dynamic_cli_options(config_class): return options - dynamic_options = create_dynamic_cli_options(TrainingConfiguration) - @app.command() +def main(mode: str, **dynamic_options): + """Main entry point for Florence-2 model.""" + if mode == "train": + train(**dynamic_options) + elif mode == "evaluate": + evaluate(**dynamic_options) + else: + typer.echo(f"Unknown mode: {mode}") + raise typer.Exit(code=1) + def train(**dynamic_options): """Train a Florence-2 model.""" # Filter out None values @@ -189,12 +196,9 @@ def train(**dynamic_options): train_florence2(config) - -@app.command() def evaluate(**dynamic_options): """Evaluate a Florence-2 model.""" typer.echo("Evaluation not implemented yet.") - if __name__ == "__main__": - app() + app() \ No newline at end of file From fb212ead9e03c0842e3a83d0bd3bfe8d1d69f877 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 19:39:17 +0200 Subject: [PATCH 08/14] fix 3 `No such option: --mode ` --- .../trainer/models/florence_2/entrypoint.py | 81 ++++++++++++------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index da811e6..3778f72 100644 --- 
a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -148,55 +148,78 @@ # pass import typer -from typing import get_type_hints, Union, Literal +from typing import Optional, List, Union, Literal from maestro.trainer.models.florence_2.core import TrainingConfiguration, train as train_florence2 app = typer.Typer() -def create_dynamic_cli_options(config_class): - hints = get_type_hints(config_class) - options = {} - - for field_name, field_type in hints.items(): - if field_name == 'metrics': # Skip complex types like metrics - continue - - if field_type == bool: - options[field_name] = typer.Option(None, help=f"{field_name} parameter") - elif field_type in (int, float, str): - options[field_name] = typer.Option(None, help=f"{field_name} parameter") - elif getattr(field_type, "__origin__", None) == Union: - if type(None) in field_type.__args__: - options[field_name] = typer.Option(None, help=f"{field_name} parameter") - elif getattr(field_type, "__origin__", None) == Literal: - options[field_name] = typer.Option(None, help=f"{field_name} parameter") - - return options - -dynamic_options = create_dynamic_cli_options(TrainingConfiguration) - @app.command() -def main(mode: str, **dynamic_options): +def main( + mode: str = typer.Option(..., help="Mode to run: train or evaluate"), + dataset_path: str = typer.Option(..., help="Path to the dataset used for training"), + model_id: str = typer.Option(None, help="Identifier for the Florence-2 model"), + revision: str = typer.Option(None, help="Revision of the model to use"), + device: str = typer.Option(None, help="Device to use for training"), + cache_dir: Optional[str] = typer.Option(None, help="Directory to cache the model"), + epochs: int = typer.Option(10, help="Number of training epochs"), + optimizer: str = typer.Option("adamw", help="Optimizer to use for training"), + lr: float = typer.Option(1e-5, help="Learning rate for the optimizer"), + lr_scheduler: str = typer.Option("linear", help="Learning rate scheduler"), + batch_size: int = typer.Option(4, help="Batch size for training"), + val_batch_size: Optional[int] = typer.Option(None, help="Batch size for validation"), + num_workers: int = typer.Option(0, help="Number of workers for data loading"), + val_num_workers: Optional[int] = typer.Option(None, help="Number of workers for validation data loading"), + lora_r: int = typer.Option(8, help="Rank of the LoRA update matrices"), + lora_alpha: int = typer.Option(8, help="Scaling factor for the LoRA update"), + lora_dropout: float = typer.Option(0.05, help="Dropout probability for LoRA layers"), + bias: str = typer.Option("none", help="Which bias to train"), + use_rslora: bool = typer.Option(True, help="Whether to use RSLoRA"), + init_lora_weights: str = typer.Option("gaussian", help="How to initialize LoRA weights"), + output_dir: str = typer.Option("./training/florence-2", help="Directory to save output files"), + metrics: List[str] = typer.Option([], help="List of metrics to track during training") +): """Main entry point for Florence-2 model.""" if mode == "train": - train(**dynamic_options) + train( + dataset_path=dataset_path, + model_id=model_id, + revision=revision, + device=device, + cache_dir=cache_dir, + epochs=epochs, + optimizer=optimizer, + lr=lr, + lr_scheduler=lr_scheduler, + batch_size=batch_size, + val_batch_size=val_batch_size, + num_workers=num_workers, + val_num_workers=val_num_workers, + lora_r=lora_r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + bias=bias, + 
use_rslora=use_rslora, + init_lora_weights=init_lora_weights, + output_dir=output_dir, + metrics=metrics + ) elif mode == "evaluate": - evaluate(**dynamic_options) + evaluate() else: typer.echo(f"Unknown mode: {mode}") raise typer.Exit(code=1) -def train(**dynamic_options): +def train(**kwargs): """Train a Florence-2 model.""" # Filter out None values - config_overrides = {k: v for k, v in dynamic_options.items() if v is not None} + config_overrides = {k: v for k, v in kwargs.items() if v is not None} # Create configuration with overrides config = TrainingConfiguration(**config_overrides) train_florence2(config) -def evaluate(**dynamic_options): +def evaluate(): """Evaluate a Florence-2 model.""" typer.echo("Evaluation not implemented yet.") From f15b7a9134b2a9befc76ed31f184cdd236a6f59c Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 19:45:30 +0200 Subject: [PATCH 09/14] fix 4 `No such option: --mode ` --- maestro/trainer/models/florence_2/entrypoint.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 3778f72..92f83d5 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -148,12 +148,13 @@ # pass import typer -from typing import Optional, List, Union, Literal +from typing import Optional, List from maestro.trainer.models.florence_2.core import TrainingConfiguration, train as train_florence2 app = typer.Typer() -@app.command() + +@app.callback() def main( mode: str = typer.Option(..., help="Mode to run: train or evaluate"), dataset_path: str = typer.Option(..., help="Path to the dataset used for training"), @@ -209,6 +210,7 @@ def main( typer.echo(f"Unknown mode: {mode}") raise typer.Exit(code=1) + def train(**kwargs): """Train a Florence-2 model.""" # Filter out None values @@ -219,9 +221,11 @@ def train(**kwargs): train_florence2(config) + def evaluate(): """Evaluate a Florence-2 model.""" typer.echo("Evaluation not implemented yet.") + if __name__ == "__main__": app() \ No newline at end of file From 566d9ca773295f3d6711c197135a0106488bf8ab Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 19:51:54 +0200 Subject: [PATCH 10/14] fix 5 `No such option: --mode ` --- maestro/cli/main.py | 2 +- maestro/trainer/models/florence_2/entrypoint.py | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/maestro/cli/main.py b/maestro/cli/main.py index 5481b24..3ef8500 100644 --- a/maestro/cli/main.py +++ b/maestro/cli/main.py @@ -15,7 +15,7 @@ # app() import typer -from maestro.trainer.models.florence_2.entrypoint import app as florence2_app +from maestro.trainer.models.florence_2.entrypoint import florence2_app app = typer.Typer() app.add_typer(florence2_app, name="florence2") diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 92f83d5..31ea38c 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -148,13 +148,12 @@ # pass import typer -from typing import Optional, List +from typing import Optional, List, Union, Literal from maestro.trainer.models.florence_2.core import TrainingConfiguration, train as train_florence2 -app = typer.Typer() +florence2_app = typer.Typer() - -@app.callback() +@florence2_app.command() def main( mode: str = typer.Option(..., help="Mode to run: train or evaluate"), dataset_path: str = typer.Option(..., 
help="Path to the dataset used for training"), @@ -210,7 +209,6 @@ def main( typer.echo(f"Unknown mode: {mode}") raise typer.Exit(code=1) - def train(**kwargs): """Train a Florence-2 model.""" # Filter out None values @@ -221,11 +219,6 @@ def train(**kwargs): train_florence2(config) - def evaluate(): """Evaluate a Florence-2 model.""" typer.echo("Evaluation not implemented yet.") - - -if __name__ == "__main__": - app() \ No newline at end of file From fb1c8268a66b78c68c2942dc4711e304e169915a Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 10 Sep 2024 19:57:14 +0200 Subject: [PATCH 11/14] fix 6 `No such option: --mode ` --- maestro/cli/main.py | 6 ++++-- maestro/trainer/models/florence_2/entrypoint.py | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/maestro/cli/main.py b/maestro/cli/main.py index 3ef8500..8e2edc9 100644 --- a/maestro/cli/main.py +++ b/maestro/cli/main.py @@ -15,10 +15,12 @@ # app() import typer -from maestro.trainer.models.florence_2.entrypoint import florence2_app +from maestro.trainer.models.florence_2.entrypoint import florence2 app = typer.Typer() -app.add_typer(florence2_app, name="florence2") + +# Add the florence2 command to the main app +app.command()(florence2) if __name__ == "__main__": app() diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 31ea38c..30204ce 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -151,10 +151,10 @@ from typing import Optional, List, Union, Literal from maestro.trainer.models.florence_2.core import TrainingConfiguration, train as train_florence2 -florence2_app = typer.Typer() +app = typer.Typer() -@florence2_app.command() -def main( +@app.command() +def florence2( mode: str = typer.Option(..., help="Mode to run: train or evaluate"), dataset_path: str = typer.Option(..., help="Path to the dataset used for training"), model_id: str = typer.Option(None, help="Identifier for the Florence-2 model"), @@ -222,3 +222,6 @@ def train(**kwargs): def evaluate(): """Evaluate a Florence-2 model.""" typer.echo("Evaluation not implemented yet.") + +if __name__ == "__main__": + app() From d556a884e4ca7da9d7cdd29984a9e2e9bb243aee Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 11 Sep 2024 00:01:21 +0200 Subject: [PATCH 12/14] bring back Pawel's code with improvements --- maestro/cli/env.py | 4 +- maestro/cli/introspection.py | 56 +-- maestro/cli/main.py | 25 +- maestro/cli/utils.py | 4 +- maestro/trainer/models/florence_2/core.py | 45 +-- .../trainer/models/florence_2/entrypoint.py | 353 +++++++----------- 6 files changed, 196 insertions(+), 291 deletions(-) diff --git a/maestro/cli/env.py b/maestro/cli/env.py index 5322a5d..b95525e 100644 --- a/maestro/cli/env.py +++ b/maestro/cli/env.py @@ -1,2 +1,2 @@ -# DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "DISABLE_RECIPE_IMPORTS_WARNINGS" -# DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "False" +DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "DISABLE_RECIPE_IMPORTS_WARNINGS" +DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV = "False" diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py index 6223144..fc6aef9 100644 --- a/maestro/cli/introspection.py +++ b/maestro/cli/introspection.py @@ -1,37 +1,37 @@ -# import os +import os -# import typer +import typer -# from maestro.cli.env import DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, \ -# DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV -# from maestro.cli.utils import str2bool +from maestro.cli.env import 
DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, \ + DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV +from maestro.cli.utils import str2bool -# def find_training_recipes(app: typer.Typer) -> None: -# try: -# from maestro.trainer.models.florence_2.entrypoint import florence_2_app +def find_training_recipes(app: typer.Typer) -> None: + try: + from maestro.trainer.models.florence_2.entrypoint import florence_2_app -# app.add_typer(florence_2_app, name="florence2") -# except Exception: -# _warn_about_recipe_import_error(model_name="Florence 2") + app.add_typer(florence_2_app, name="florence2") + except Exception: + _warn_about_recipe_import_error(model_name="Florence 2") -# try: -# from maestro.trainer.models.paligemma.entrypoint import paligemma_app + try: + from maestro.trainer.models.paligemma.entrypoint import paligemma_app -# app.add_typer(paligemma_app, name="paligemma") -# except Exception: -# _warn_about_recipe_import_error(model_name="PaliGemma") + app.add_typer(paligemma_app, name="paligemma") + except Exception: + _warn_about_recipe_import_error(model_name="PaliGemma") -# def _warn_about_recipe_import_error(model_name: str) -> None: -# disable_warnings = str2bool( -# os.getenv( -# DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, -# DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, -# ) -# ) -# if disable_warnings: -# return None -# warning = typer.style("WARNING", fg=typer.colors.RED, bold=True) -# message = "🚧 " + warning + f" cannot import recipe for {model_name}" -# typer.echo(message) +def _warn_about_recipe_import_error(model_name: str) -> None: + disable_warnings = str2bool( + os.getenv( + DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, + DEFAULT_DISABLE_RECIPE_IMPORTS_WARNINGS_ENV, + ) + ) + if disable_warnings: + return None + warning = typer.style("WARNING", fg=typer.colors.RED, bold=True) + message = "🚧 " + warning + f" cannot import recipe for {model_name}" + typer.echo(message) diff --git a/maestro/cli/main.py b/maestro/cli/main.py index 8e2edc9..b600e3a 100644 --- a/maestro/cli/main.py +++ b/maestro/cli/main.py @@ -1,26 +1,15 @@ -# import typer - -# from maestro.cli.introspection import find_training_recipes - -# app = typer.Typer() -# find_training_recipes(app=app) - - -# @app.command(help="Display information about maestro") -# def info(): -# typer.echo("Welcome to maestro CLI. Let's train some VLM! 🏋") +import typer +from maestro.cli.introspection import find_training_recipes -# if __name__ == "__main__": -# app() +app = typer.Typer() +find_training_recipes(app=app) -import typer -from maestro.trainer.models.florence_2.entrypoint import florence2 -app = typer.Typer() +@app.command(help="Display information about maestro") +def info(): + typer.echo("Welcome to maestro CLI. Let's train some VLM! 🏋") -# Add the florence2 command to the main app -app.command()(florence2) if __name__ == "__main__": app() diff --git a/maestro/cli/utils.py b/maestro/cli/utils.py index 6d31bd9..0751fef 100644 --- a/maestro/cli/utils.py +++ b/maestro/cli/utils.py @@ -1,2 +1,2 @@ -# def str2bool(value: str) -> bool: -# return value.lower() in {"y", "t", "yes", "true"} +def str2bool(value: str) -> bool: + return value.lower() in {"y", "t", "yes", "true"} diff --git a/maestro/trainer/models/florence_2/core.py b/maestro/trainer/models/florence_2/core.py index 46ea75b..c4ca286 100644 --- a/maestro/trainer/models/florence_2/core.py +++ b/maestro/trainer/models/florence_2/core.py @@ -31,30 +31,33 @@ class TrainingConfiguration: """Configuration for training a Florence-2 model. 
This class encapsulates all the parameters needed for training a Florence-2 model,
-    including dataset paths, model specifications, training hyperparameters, and output settings.
+    including dataset paths, model specifications, training hyperparameters, and output
+    settings.
 
     Attributes:
         dataset_path (str): Path to the dataset used for training.
-        model_id (str): Identifier for the Florence-2 model. Defaults to DEFAULT_FLORENCE2_MODEL_ID.
-        revision (str): Revision of the model to use. Defaults to DEFAULT_FLORENCE2_MODEL_REVISION.
-        device (torch.device): Device to use for training. Defaults to DEVICE.
-        cache_dir (Optional[str]): Directory to cache the model. Defaults to None.
-        epochs (int): Number of training epochs. Defaults to 10.
-        optimizer (Literal["sgd", "adamw", "adam"]): Optimizer to use for training. Defaults to "adamw".
-        lr (float): Learning rate for the optimizer. Defaults to 1e-5.
-        lr_scheduler (Literal["linear", "cosine", "polynomial"]): Learning rate scheduler. Defaults to "linear".
-        batch_size (int): Batch size for training. Defaults to 4.
-        val_batch_size (Optional[int]): Batch size for validation. Defaults to None.
-        num_workers (int): Number of workers for data loading. Defaults to 0.
-        val_num_workers (Optional[int]): Number of workers for validation data loading. Defaults to None.
-        lora_r (int): Rank of the LoRA update matrices. Defaults to 8.
-        lora_alpha (int): Scaling factor for the LoRA update. Defaults to 8.
-        lora_dropout (float): Dropout probability for LoRA layers. Defaults to 0.05.
-        bias (Literal["none", "all", "lora_only"]): Which bias to train. Defaults to "none".
-        use_rslora (bool): Whether to use RSLoRA. Defaults to True.
-        init_lora_weights (Union[bool, LoraInitLiteral]): How to initialize LoRA weights. Defaults to "gaussian".
-        output_dir (str): Directory to save output files. Defaults to "./training/florence-2".
-        metrics (List[BaseMetric]): List of metrics to track during training. Defaults to an empty list.
+        model_id (str): Identifier for the Florence-2 model.
+        revision (str): Revision of the model to use.
+        device (torch.device): Device to use for training.
+        cache_dir (Optional[str]): Directory to cache the model.
+        epochs (int): Number of training epochs.
+        optimizer (Literal["sgd", "adamw", "adam"]): Optimizer to use for training.
+        lr (float): Learning rate for the optimizer.
+        lr_scheduler (Literal["linear", "cosine", "polynomial"]): Learning rate
+            scheduler.
+        batch_size (int): Batch size for training.
+        val_batch_size (Optional[int]): Batch size for validation.
+        num_workers (int): Number of workers for data loading.
+        val_num_workers (Optional[int]): Number of workers for validation data loading.
+        lora_r (int): Rank of the LoRA update matrices.
+        lora_alpha (int): Scaling factor for the LoRA update.
+        lora_dropout (float): Dropout probability for LoRA layers.
+        bias (Literal["none", "all", "lora_only"]): Which bias to train.
+        use_rslora (bool): Whether to use RSLoRA.
+        init_lora_weights (Union[bool, LoraInitLiteral]): How to initialize LoRA
+            weights.
+        output_dir (str): Directory to save output files.
+        metrics (List[BaseMetric]): List of metrics to track during training.
""" dataset_path: str model_id: str = DEFAULT_FLORENCE2_MODEL_ID diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 30204ce..acab19e 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -1,227 +1,140 @@ -# import dataclasses -# from typing import Optional, Annotated - -# import rich -# import torch -# import typer - -# from maestro.trainer.models.florence_2.checkpoints import DEFAULT_FLORENCE2_MODEL_ID, \ -# DEFAULT_FLORENCE2_MODEL_REVISION, DEVICE -# from maestro.trainer.models.florence_2.core import TrainingConfiguration -# from maestro.trainer.models.florence_2.core import train as train_fun - -# florence_2_app = typer.Typer(help="Fine-tune and evaluate Florence 2 model") - - -# @florence_2_app.command( -# help="Train Florence 2 model", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} -# ) -# def train( -# dataset_location: Annotated[ -# str, -# typer.Option("--dataset_location", help="Path to directory with dataset"), -# ], -# model_id_or_path: Annotated[ -# str, -# typer.Option("--model_id_or_path", help="Model to be used or path to your checkpoint"), -# ] = DEFAULT_FLORENCE2_MODEL_ID, -# revision: Annotated[ -# str, -# typer.Option("--revision", help="Revision of Florence2 HF repository"), -# ] = DEFAULT_FLORENCE2_MODEL_REVISION, -# device: Annotated[ -# str, -# typer.Option("--device", help="CUDA device ID to be used (in format: 'cuda:0')"), -# ] = DEVICE, -# transformers_cache_dir: Annotated[ -# Optional[str], -# typer.Option("--transformers_cache_dir", help="Cache dir for HF weights"), -# ] = None, -# training_epochs: Annotated[ -# int, -# typer.Option("--training_epochs", help="Number of training epochs"), -# ] = 10, -# optimiser: Annotated[ -# str, -# typer.Option("--optimiser", help="Optimiser to be used"), -# ] = "adamw", -# learning_rate: Annotated[ -# float, -# typer.Option("--learning_rate", help="Learning rate"), -# ] = 1e-5, -# lr_scheduler: Annotated[ -# str, -# typer.Option("--lr_scheduler", help="LR scheduler"), -# ] = "linear", -# train_batch_size: Annotated[ -# int, -# typer.Option("--train_batch_size", help="Batch size for training"), -# ] = 4, -# test_batch_size: Annotated[ -# Optional[int], -# typer.Option( -# "--train_batch_size", help="Batch size for validation and test. If not given - train will be used." -# ), -# ] = None, -# loaders_workers: Annotated[ -# int, -# typer.Option("--loaders_workers", help="Number of loaders workers. 0 = # of CPU"), -# ] = 0, -# test_loaders_workers: Annotated[ -# Optional[int], -# typer.Option( -# "--test_loaders_workers", -# help="Number of workers for test and val loaders. 
If not given - train will be used.", -# ), -# ] = None, -# lora_r: Annotated[ -# int, -# typer.Option("--lora_r", help="Value of Lora R"), -# ] = 8, -# lora_alpha: Annotated[ -# int, -# typer.Option("--lora_alpha", help="Value of Lora Alpha"), -# ] = 8, -# lora_dropout: Annotated[ -# float, -# typer.Option("--lora_dropout", help="Value of Lora Dropout"), -# ] = 0.05, -# bias: Annotated[ -# str, -# typer.Option("--bias", help="Value of Lora Bias"), -# ] = "none", -# use_rslora: Annotated[ -# bool, -# typer.Option( -# "--use_rslora/--no_use_rslora", -# help="Boolean flag to decide if rslora to be used", -# ), -# ] = True, -# init_lora_weights: Annotated[ -# str, -# typer.Option("--init_lora_weights", help="Lora weights initialisation"), -# ] = "gaussian", -# training_dir: Annotated[ -# str, -# typer.Option("--training_dir", help="Path to directory where training outputs should be preserved"), -# ] = "./training/florence-2", -# max_checkpoints_to_keep: Annotated[ -# int, -# typer.Option("--max_checkpoints_to_keep", help="Max checkpoints to keep"), -# ] = 3, -# num_samples_to_visualise: Annotated[ -# int, -# typer.Option("--num_samples_to_visualise", help="Number of samples to visualise"), -# ] = 64, -# ) -> None: -# configuration = TrainingConfiguration( -# dataset_location=dataset_location, -# model_id_or_path=model_id_or_path, -# revision=revision, -# device=torch.device(device), -# transformers_cache_dir=transformers_cache_dir, -# training_epochs=training_epochs, -# optimiser=optimiser, # type: ignore -# learning_rate=learning_rate, -# lr_scheduler=lr_scheduler, # type: ignore -# train_batch_size=train_batch_size, -# test_batch_size=test_batch_size, -# loaders_workers=loaders_workers, -# test_loaders_workers=test_loaders_workers, -# lora_r=lora_r, -# lora_alpha=lora_alpha, -# lora_dropout=lora_dropout, -# bias=bias, # type: ignore -# use_rslora=use_rslora, -# init_lora_weights=init_lora_weights, # type: ignore -# training_dir=training_dir, -# max_checkpoints_to_keep=max_checkpoints_to_keep, -# num_samples_to_visualise=num_samples_to_visualise, -# ) -# typer.echo(typer.style("Training configuration", fg=typer.colors.BRIGHT_GREEN, bold=True)) -# rich.print(dataclasses.asdict(configuration)) -# train_fun(configuration=configuration) - - -# @florence_2_app.command(help="Evaluate Florence 2 model") -# def evaluate() -> None: -# pass +import dataclasses +from typing import Optional, Annotated, Literal, Union +import rich +import torch import typer -from typing import Optional, List, Union, Literal -from maestro.trainer.models.florence_2.core import TrainingConfiguration, train as train_florence2 -app = typer.Typer() +from maestro.trainer.models.florence_2.checkpoints import DEFAULT_FLORENCE2_MODEL_ID, \ + DEFAULT_FLORENCE2_MODEL_REVISION, DEVICE +from maestro.trainer.models.florence_2.core import TrainingConfiguration, \ + LoraInitLiteral +from maestro.trainer.models.florence_2.core import train as train_fun + +florence_2_app = typer.Typer(help="Fine-tune and evaluate Florence 2 model") -@app.command() -def florence2( - mode: str = typer.Option(..., help="Mode to run: train or evaluate"), - dataset_path: str = typer.Option(..., help="Path to the dataset used for training"), - model_id: str = typer.Option(None, help="Identifier for the Florence-2 model"), - revision: str = typer.Option(None, help="Revision of the model to use"), - device: str = typer.Option(None, help="Device to use for training"), - cache_dir: Optional[str] = typer.Option(None, help="Directory to cache the model"), - epochs: int 
= typer.Option(10, help="Number of training epochs"), - optimizer: str = typer.Option("adamw", help="Optimizer to use for training"), - lr: float = typer.Option(1e-5, help="Learning rate for the optimizer"), - lr_scheduler: str = typer.Option("linear", help="Learning rate scheduler"), - batch_size: int = typer.Option(4, help="Batch size for training"), - val_batch_size: Optional[int] = typer.Option(None, help="Batch size for validation"), - num_workers: int = typer.Option(0, help="Number of workers for data loading"), - val_num_workers: Optional[int] = typer.Option(None, help="Number of workers for validation data loading"), - lora_r: int = typer.Option(8, help="Rank of the LoRA update matrices"), - lora_alpha: int = typer.Option(8, help="Scaling factor for the LoRA update"), - lora_dropout: float = typer.Option(0.05, help="Dropout probability for LoRA layers"), - bias: str = typer.Option("none", help="Which bias to train"), - use_rslora: bool = typer.Option(True, help="Whether to use RSLoRA"), - init_lora_weights: str = typer.Option("gaussian", help="How to initialize LoRA weights"), - output_dir: str = typer.Option("./training/florence-2", help="Directory to save output files"), - metrics: List[str] = typer.Option([], help="List of metrics to track during training") -): - """Main entry point for Florence-2 model.""" - if mode == "train": - train( - dataset_path=dataset_path, - model_id=model_id, - revision=revision, - device=device, - cache_dir=cache_dir, - epochs=epochs, - optimizer=optimizer, - lr=lr, - lr_scheduler=lr_scheduler, - batch_size=batch_size, - val_batch_size=val_batch_size, - num_workers=num_workers, - val_num_workers=val_num_workers, - lora_r=lora_r, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - bias=bias, - use_rslora=use_rslora, - init_lora_weights=init_lora_weights, - output_dir=output_dir, - metrics=metrics - ) - elif mode == "evaluate": - evaluate() - else: - typer.echo(f"Unknown mode: {mode}") - raise typer.Exit(code=1) -def train(**kwargs): - """Train a Florence-2 model.""" - # Filter out None values - config_overrides = {k: v for k, v in kwargs.items() if v is not None} - - # Create configuration with overrides - config = TrainingConfiguration(**config_overrides) - - train_florence2(config) +@florence_2_app.command( + help="Train Florence 2 model", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True} +) +def train( + dataset: Annotated[ + str, + typer.Option("--dataset", help="Path to the dataset used for training"), + ], + model_id: Annotated[ + str, + typer.Option("--model_id", help="Identifier for the Florence-2 model"), + ] = DEFAULT_FLORENCE2_MODEL_ID, + revision: Annotated[ + str, + typer.Option("--revision", help="Revision of the model to use"), + ] = DEFAULT_FLORENCE2_MODEL_REVISION, + device: Annotated[ + str, + typer.Option("--device", help="Device to use for training"), + ] = DEVICE, + cache_dir: Annotated[ + Optional[str], + typer.Option("--cache_dir", help="Directory to cache the model"), + ] = None, + epochs: Annotated[ + int, + typer.Option("--epochs", help="Number of training epochs"), + ] = 10, + optimizer: Annotated[ + Literal["sgd", "adamw", "adam"], + typer.Option("--optimizer", help="Optimizer to use for training"), + ] = "adamw", + lr: Annotated[ + float, + typer.Option("--lr", help="Learning rate for the optimizer"), + ] = 1e-5, + lr_scheduler: Annotated[ + Literal["linear", "cosine", "polynomial"], + typer.Option("--lr_scheduler", help="Learning rate scheduler"), + ] = "linear", + batch_size: Annotated[ + 
int, + typer.Option("--batch_size", help="Batch size for training"), + ] = 4, + val_batch_size: Annotated[ + Optional[int], + typer.Option("--val_batch_size", help="Batch size for validation"), + ] = None, + num_workers: Annotated[ + int, + typer.Option("--num_workers", help="Number of workers for data loading"), + ] = 0, + val_num_workers: Annotated[ + Optional[int], + typer.Option("--val_num_workers", help="Number of workers for validation data loading"), + ] = None, + lora_r: Annotated[ + int, + typer.Option("--lora_r", help="Rank of the LoRA update matrices"), + ] = 8, + lora_alpha: Annotated[ + int, + typer.Option("--lora_alpha", help="Scaling factor for the LoRA update"), + ] = 8, + lora_dropout: Annotated[ + float, + typer.Option("--lora_dropout", help="Dropout probability for LoRA layers"), + ] = 0.05, + bias: Annotated[ + Literal["none", "all", "lora_only"], + typer.Option("--bias", help="Which bias to train"), + ] = "none", + use_rslora: Annotated[ + bool, + typer.Option("--use_rslora/--no_use_rslora", help="Whether to use RSLoRA"), + ] = True, + init_lora_weights: Annotated[ + Union[bool, LoraInitLiteral], + typer.Option("--init_lora_weights", help="How to initialize LoRA weights"), + ] = "gaussian", + output_dir: Annotated[ + str, + typer.Option("--output_dir", help="Directory to save output files"), + ] = "./training/florence-2", +) -> None: + config = TrainingConfiguration( + dataset=dataset, + model_id=model_id, + revision=revision, + device=torch.device(device), + cache_dir=cache_dir, + epochs=epochs, + optimizer=optimizer, + lr=lr, + lr_scheduler=lr_scheduler, + batch_size=batch_size, + val_batch_size=val_batch_size, + num_workers=num_workers, + val_num_workers=val_num_workers, + lora_r=lora_r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + bias=bias, + use_rslora=use_rslora, + init_lora_weights=init_lora_weights, + output_dir=output_dir + ) + typer.echo(typer.style( + text="Training configuration", + fg=typer.colors.BRIGHT_GREEN, + bold=True + )) + rich.print(dataclasses.asdict(config)) + train_fun(config=config) -def evaluate(): - """Evaluate a Florence-2 model.""" - typer.echo("Evaluation not implemented yet.") -if __name__ == "__main__": - app() +@florence_2_app.command(help="Evaluate Florence 2 model") +def evaluate() -> None: + typer.echo(typer.style( + "Evaluation command for Florence 2 is not yet implemented.", + fg=typer.colors.YELLOW, + bold=True + )) From f46049ed9a780f8dcca87d4b97de831e661d89bc Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 11 Sep 2024 00:14:02 +0200 Subject: [PATCH 13/14] remove Literal from command definitions --- maestro/trainer/models/florence_2/core.py | 6 +++--- maestro/trainer/models/florence_2/entrypoint.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/maestro/trainer/models/florence_2/core.py b/maestro/trainer/models/florence_2/core.py index c4ca286..068199d 100644 --- a/maestro/trainer/models/florence_2/core.py +++ b/maestro/trainer/models/florence_2/core.py @@ -35,7 +35,7 @@ class TrainingConfiguration: settings. Attributes: - dataset_path (str): Path to the dataset used for training. + dataset (str): Path to the dataset used for training. model_id (str): Identifier for the Florence-2 model. revision (str): Revision of the model to use. device (torch.device): Device to use for training. @@ -59,7 +59,7 @@ class TrainingConfiguration: output_dir (str): Directory to save output files. metrics (List[BaseMetric]): List of metrics to track during training. 
""" - dataset_path: str + dataset: str model_id: str = DEFAULT_FLORENCE2_MODEL_ID revision: str = DEFAULT_FLORENCE2_MODEL_REVISION device: torch.device = DEVICE @@ -100,7 +100,7 @@ def train(config: TrainingConfiguration) -> None: cache_dir=config.cache_dir, ) train_loader, val_loader, test_loader = prepare_data_loaders( - dataset_location=config.dataset_path, + dataset_location=config.dataset, train_batch_size=config.batch_size, processor=processor, device=config.device, diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index acab19e..9310046 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -44,7 +44,7 @@ def train( typer.Option("--epochs", help="Number of training epochs"), ] = 10, optimizer: Annotated[ - Literal["sgd", "adamw", "adam"], + str, typer.Option("--optimizer", help="Optimizer to use for training"), ] = "adamw", lr: Annotated[ @@ -52,7 +52,7 @@ def train( typer.Option("--lr", help="Learning rate for the optimizer"), ] = 1e-5, lr_scheduler: Annotated[ - Literal["linear", "cosine", "polynomial"], + str, typer.Option("--lr_scheduler", help="Learning rate scheduler"), ] = "linear", batch_size: Annotated[ @@ -84,7 +84,7 @@ def train( typer.Option("--lora_dropout", help="Dropout probability for LoRA layers"), ] = 0.05, bias: Annotated[ - Literal["none", "all", "lora_only"], + str, typer.Option("--bias", help="Which bias to train"), ] = "none", use_rslora: Annotated[ @@ -92,7 +92,7 @@ def train( typer.Option("--use_rslora/--no_use_rslora", help="Whether to use RSLoRA"), ] = True, init_lora_weights: Annotated[ - Union[bool, LoraInitLiteral], + Union[bool, str], typer.Option("--init_lora_weights", help="How to initialize LoRA weights"), ] = "gaussian", output_dir: Annotated[ From a2850ac9d45fb0282266f2a3d6c4394576659f11 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 11 Sep 2024 00:26:06 +0200 Subject: [PATCH 14/14] remove Union from command definitions --- maestro/trainer/models/florence_2/entrypoint.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index 9310046..a686350 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -1,5 +1,5 @@ import dataclasses -from typing import Optional, Annotated, Literal, Union +from typing import Optional, Annotated import rich import torch @@ -7,8 +7,7 @@ from maestro.trainer.models.florence_2.checkpoints import DEFAULT_FLORENCE2_MODEL_ID, \ DEFAULT_FLORENCE2_MODEL_REVISION, DEVICE -from maestro.trainer.models.florence_2.core import TrainingConfiguration, \ - LoraInitLiteral +from maestro.trainer.models.florence_2.core import TrainingConfiguration from maestro.trainer.models.florence_2.core import train as train_fun florence_2_app = typer.Typer(help="Fine-tune and evaluate Florence 2 model") @@ -92,7 +91,7 @@ def train( typer.Option("--use_rslora/--no_use_rslora", help="Whether to use RSLoRA"), ] = True, init_lora_weights: Annotated[ - Union[bool, str], + str, typer.Option("--init_lora_weights", help="How to initialize LoRA weights"), ] = "gaussian", output_dir: Annotated[