Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Learner test build #75

Merged
merged 13 commits into from
Jan 31, 2025
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 40 additions & 22 deletions src/stimulus/learner/raytune_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from safetensors.torch import save_model as safe_save_model
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import ray

from stimulus.data.handlertorch import TorchDataset
from stimulus.data.loaders import EncoderLoader
Expand Down Expand Up @@ -111,7 +112,7 @@ def __init__(

def tuner_initialization(
self,
config_path: str,
data_config_path: str,
data_path: str,
encoder_loader: EncoderLoader,
*,
Expand All @@ -138,17 +139,33 @@ def tuner_initialization(

logging.info(f"PER_TRIAL resources -> GPU: {self.gpu_per_trial} CPU: {self.cpu_per_trial}")

# Configure trainable with resources and data
trainable = tune.with_resources(TuneModel, resources={"cpu": self.cpu_per_trial, "gpu": self.gpu_per_trial})
trainable = tune.with_parameters(
trainable,
training=TorchDataset(config_path=config_path, csv_path=data_path, encoder_loader=encoder_loader, split=0),
validation=TorchDataset(
config_path=config_path,
# Pre-load and encode datasets once, then put them in Ray's object store
@ray.remote
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahhh so ray.remote is the correct way to do it :)

def create_datasets(data_config_path: str, data_path: str, encoder_loader: EncoderLoader):
    """Materialize the train/validation dataset pair for one tuning run.

    Builds two ``TorchDataset`` objects over the same CSV and config, differing
    only in which split they expose (0 = training, 1 = validation), and returns
    them as a ``(training, validation)`` tuple.

    Args:
        data_config_path: Path to the data config consumed by ``TorchDataset``.
        data_path: Path to the CSV file holding the raw samples.
        encoder_loader: Loader providing the encoders used to encode the data.

    Returns:
        Tuple of ``(training_dataset, validation_dataset)``.
    """
    # Both splits share every argument except the split index, so build them
    # from a single parameterized expression rather than two copied literals.
    train_ds, val_ds = (
        TorchDataset(
            config_path=data_config_path,
            csv_path=data_path,
            encoder_loader=encoder_loader,
            split=split_idx,
        )
        for split_idx in (0, 1)
    )
    return train_ds, val_ds

# Put datasets in Ray's object store
datasets_ref = create_datasets.remote(data_config_path, data_path, encoder_loader)

# Configure trainable with resources and dataset parameters
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't remember exactly why, but I think initially I put them here to avoid the need for reencoding each time it is tuned.

trainable = tune.with_resources(
tune.with_parameters(
TuneModel,
datasets_ref=datasets_ref,
),
resources={"cpu": self.cpu_per_trial, "gpu": self.gpu_per_trial}
)

return tune.Tuner(trainable, tune_config=self.tune_config, param_space=self.config, run_config=self.run_config)
Expand All @@ -161,20 +178,15 @@ def tune(self) -> None:
class TuneModel(Trainable):
"""Trainable model class for Ray Tune."""

def setup(self, config: dict[Any, Any]) -> None:
def setup(self, config: dict[Any, Any], *, datasets_ref: ray.ObjectRef) -> None:
"""Get the model, loss function(s), optimizer, train and test data from the config."""
# set the seeds the second time, first in TuneWrapper initialization. This will make all important seed worker specific.
# set the seeds the second time, first in TuneWrapper initialization
set_general_seeds(self.config["ray_worker_seed"])

# Initialize model with the config params
self.model = config["model"](**config["model_params"])

# Add data path
self.data_path = config["data_path"]

# Get the loss function(s) from the config model params
# Note that the loss function(s) are stored in a dictionary,
# where the keys are the key of loss_params in the yaml config file and the values are the loss functions associated to such keys.
self.loss_dict = config["loss_params"]
for key, loss_fn in self.loss_dict.items():
try:
Expand All @@ -186,23 +198,29 @@ def setup(self, config: dict[Any, Any]) -> None:

# get the optimizer parameters
optimizer_lr = config["optimizer_params"]["lr"]

# get the optimizer from PyTorch
self.optimizer = getattr(optim, config["optimizer_params"]["method"])(self.model.parameters(), lr=optimizer_lr)
self.optimizer = getattr(optim, config["optimizer_params"]["method"])(
self.model.parameters(),
lr=optimizer_lr
)

# get step size from the config
self.step_size = config["tune"]["step_size"]

# Get datasets from Ray's object store
training, validation = ray.get(datasets_ref)

# use dataloader on training/validation data
self.batch_size = config["data_params"]["batch_size"]
training: Dataset = config["training"]
validation: Dataset = config["validation"]
self.training = DataLoader(
training,
batch_size=self.batch_size,
shuffle=True,
) # TODO need to check the reproducibility of this shuffling
self.validation = DataLoader(validation, batch_size=self.batch_size, shuffle=True)
)
self.validation = DataLoader(
validation,
batch_size=self.batch_size,
shuffle=True
)

# debug section, first create a dedicated directory for each worker inside Ray_results/<tune_model_run_specific_dir> location
debug_dir = os.path.join(
Expand Down