From f05869eed74fb50ce28cfcab1c4a8b800fd869be Mon Sep 17 00:00:00 2001
From: Jagger Denhof <58100760+jdenhof@users.noreply.github.com>
Date: Thu, 21 Nov 2024 16:15:51 -0500
Subject: [PATCH] Added parallel configuration for conditional layers (#57)

* added version to run-compare
* updated run-compare to take command line args
* fixed argument parsing order for extra args
* proposed meaned loss function after recon + kl
* added experiment to isolate adv-cond config
* updated workflow memory requirements
* fixed find issues in run-compare
* fixed run-compare ls issues
* added mean and variance to tensorboard log
* renamed experiment config to parallel
* removed layer and made concat_config ConcatBlockConfig
* added parallel support for conditional layers
* added comments for alternate loss addition
* added scripts to gather from tsv file
---
 .../parallel/adversarial-conditional.yaml | 130 +++++++++++++++++
 configs/model/parallel/conditional.yaml   | 127 +++++++++++++++++
 scripts/gather-norms.py                   |  18 +++
 scripts/run-compare.sh                    | 133 +++++++++++++++---
 src/cmmvae/models/cmmvae_model.py         |   5 +
 src/cmmvae/modules/base/__init__.py       |   2 +
 src/cmmvae/modules/base/components.py     |  33 ++++-
 src/cmmvae/modules/clvae.py               |  47 +++++--
 src/cmmvae/modules/vae.py                 |  16 ++-
 workflow/profile/slurm/config.yaml        |  24 ++--
 10 files changed, 487 insertions(+), 48 deletions(-)
 create mode 100644 configs/model/parallel/adversarial-conditional.yaml
 create mode 100644 configs/model/parallel/conditional.yaml
 create mode 100644 scripts/gather-norms.py

diff --git a/configs/model/parallel/adversarial-conditional.yaml b/configs/model/parallel/adversarial-conditional.yaml
new file mode 100644
index 0000000..c18d5c4
--- /dev/null
+++ b/configs/model/parallel/adversarial-conditional.yaml
@@ -0,0 +1,130 @@
+class_path: cmmvae.models.CMMVAEModel
+init_args:
+  kl_annealing_fn:
+    class_path: cmmvae.modules.base.annealing_fn.LinearKLAnnealingFn
+    init_args:
+      min_kl_weight: 0.1
+      max_kl_weight: 0.5
+      warmup_steps: 1e4
+      climax_steps: 6e4
+  record_gradients: false
+  adv_weight: 25
+  gradient_record_cap: 20
+  autograd_config:
+    class_path: cmmvae.config.AutogradConfig
+    init_args:
+      adversarial_gradient_clip:
+        class_path: cmmvae.config.GradientClipConfig
+        init_args:
+          val: 10
+          algorithm: norm
+      vae_gradient_clip:
+        class_path: cmmvae.config.GradientClipConfig
+        init_args:
+          val: 10
+          algorithm: norm
+      expert_gradient_clip:
+        class_path: cmmvae.config.GradientClipConfig
+        init_args:
+          val: 10
+          algorithm: norm
+  module:
+    class_path: cmmvae.modules.CMMVAE
+    init_args:
+      vae:
+        class_path: cmmvae.modules.CLVAE
+        init_args:
+          latent_dim: 128
+          encoder_config:
+            class_path: cmmvae.modules.base.FCBlockConfig
+            init_args:
+              layers: [ 512, 256 ]
+              dropout_rate: 0.0
+              use_batch_norm: True
+              use_layer_norm: False
+              activation_fn: torch.nn.ReLU
+              return_hidden: True
+          decoder_config:
+            class_path: cmmvae.modules.base.FCBlockConfig
+            init_args:
+              layers: [ 128, 256, 512 ]
+              dropout_rate: 0.0
+              use_batch_norm: False
+              use_layer_norm: False
+              activation_fn: torch.nn.ReLU
+          conditional_config:
+            class_path: cmmvae.modules.base.FCBlockConfig
+            init_args:
+              layers: [ 128 ]
+              dropout_rate: 0.0
+              use_batch_norm: False
+              use_layer_norm: True
+              activation_fn: null
+          concat_config:
+            class_path: cmmvae.modules.base.ConcatBlockConfig
+            init_args:
+              dropout_rate: 0.0
+              use_batch_norm: False
+              use_layer_norm: False
+              activation_fn: torch.nn.ReLU
+          conditionals:
+            - assay
+            - dataset_id
+            - donor_id
+            - species
+            - tissue
+          selection_order:
+            - parallel
+      experts:
+        class_path: cmmvae.modules.base.Experts
+        init_args:
+          experts:
+            - class_path: cmmvae.modules.base.Expert
+              init_args:
+                id: human
+                encoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 60530, 1024, 512 ]
+                    dropout_rate: [ 0.1, 0.0 ]
+                    use_batch_norm: True
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+                decoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 512, 1024, 60530 ]
+                    dropout_rate: 0.0
+                    use_batch_norm: False
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+            - class_path: cmmvae.modules.base.Expert
+              init_args:
+                id: mouse
+                encoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 52437, 1024, 512 ]
+                    dropout_rate: [ 0.1, 0.0 ]
+                    use_batch_norm: True
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+                decoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 512, 1024, 52437 ]
+                    dropout_rate: 0.0
+                    use_batch_norm: False
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+      adversarials:
+        - class_path: cmmvae.modules.base.FCBlockConfig
+          init_args:
+            layers: [ 256, 128, 64, 1 ]
+            dropout_rate: 0.0
+            use_batch_norm: False
+            use_layer_norm: False
+            activation_fn:
+              - torch.nn.ReLU
+              - torch.nn.ReLU
+              - torch.nn.Sigmoid
diff --git a/configs/model/parallel/conditional.yaml b/configs/model/parallel/conditional.yaml
new file mode 100644
index 0000000..92b7420
--- /dev/null
+++ b/configs/model/parallel/conditional.yaml
@@ -0,0 +1,127 @@
+class_path: cmmvae.models.CMMVAEModel
+init_args:
+  kl_annealing_fn:
+    class_path: cmmvae.modules.base.annealing_fn.LinearKLAnnealingFn
+    init_args:
+      min_kl_weight: 0.1
+      max_kl_weight: 1.0
+      warmup_steps: 1e4
+      climax_steps: 4e4
+  record_gradients: false
+  adv_weight: 0
+  gradient_record_cap: 20
+  autograd_config:
+    class_path: cmmvae.config.AutogradConfig
+    init_args:
+      adversarial_gradient_clip:
+        class_path: cmmvae.config.GradientClipConfig
+        init_args:
+          val: 10
+          algorithm: norm
+      vae_gradient_clip:
+        class_path: cmmvae.config.GradientClipConfig
+        init_args:
+          val: 10
+          algorithm: norm
+      expert_gradient_clip:
+        class_path: cmmvae.config.GradientClipConfig
+        init_args:
+          val: 10
+          algorithm: norm
+  module:
+    class_path: cmmvae.modules.CMMVAE
+    init_args:
+      vae:
+        class_path: cmmvae.modules.CLVAE
+        init_args:
+          latent_dim: 128
+          encoder_config:
+            class_path: cmmvae.modules.base.FCBlockConfig
+            init_args:
+              layers: [ 512, 256 ]
+              dropout_rate: 0.0
+              use_batch_norm: True
+              use_layer_norm: False
+              activation_fn: torch.nn.ReLU
+              return_hidden: True
+          decoder_config:
+            class_path: cmmvae.modules.base.FCBlockConfig
+            init_args:
+              layers: [ 128, 256, 512 ]
+              dropout_rate: 0.0
+              use_batch_norm: False
+              use_layer_norm: False
+              activation_fn: torch.nn.ReLU
+          conditional_config:
+            class_path: cmmvae.modules.base.FCBlockConfig
+            init_args:
+              layers: [ 128 ]
+              dropout_rate: 0.0
+              use_batch_norm: False
+              use_layer_norm: True
+              activation_fn: null
+          concat_config:
+            class_path: cmmvae.modules.base.ConcatBlockConfig
+            init_args:
+              dropout_rate: 0.0
+              use_batch_norm: False
+              use_layer_norm: False
+              activation_fn: torch.nn.ReLU
+          conditionals:
+            - assay
+            - dataset_id
+            - donor_id
+            - species
+            - tissue
+          selection_order:
+            - parallel
+      experts:
+        class_path: cmmvae.modules.base.Experts
+        init_args:
+          experts:
+            - class_path: cmmvae.modules.base.Expert
+              init_args:
+                id: human
+                encoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 60530, 1024, 512 ]
+                    dropout_rate: [ 0.1, 0.0 ]
+                    use_batch_norm: True
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+                decoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 512, 1024, 60530 ]
+                    dropout_rate: 0.0
+                    use_batch_norm: False
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+            - class_path: cmmvae.modules.base.Expert
+              init_args:
+                id: mouse
+                encoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 52437, 1024, 512 ]
+                    dropout_rate: [ 0.1, 0.0 ]
+                    use_batch_norm: True
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+                decoder_config:
+                  class_path: cmmvae.modules.base.FCBlockConfig
+                  init_args:
+                    layers: [ 512, 1024, 52437 ]
+                    dropout_rate: 0.0
+                    use_batch_norm: False
+                    use_layer_norm: False
+                    activation_fn: torch.nn.ReLU
+      adversarials:
+        # - class_path: cmmvae.modules.base.FCBlockConfig
+        #   init_args:
+        #     layers: [ 256, 128, 64, 1 ]
+        #     dropout_rate: 0.0
+        #     use_batch_norm: False
+        #     use_layer_norm: False
+        #     activation_fn: torch.nn.Sigmoid
diff --git a/scripts/gather-norms.py b/scripts/gather-norms.py
new file mode 100644
index 0000000..242555a
--- /dev/null
+++ b/scripts/gather-norms.py
@@ -0,0 +1,18 @@
+import tensorflow as tf
+import pandas as pd
+
+# Replace with the path to your TensorBoard log file
+log_file = "/mnt/projects/debruinz_project/denhofja/cmmvae/lightning_logs/run-experiment/adversarial-conditional.5a9df4a./events.out.tfevents.1730760440.g001.clipper.gvsu.edu.3079995.0"
+data = []
+
+for event in tf.compat.v1.train.summary_iterator(log_file):
+    for value in event.summary.value:
+        # Modify 'grad_norm' to the exact tag name used for gradient norms in your logs
+        if "grad_norm" in value.tag:
+            data.append(
+                {"step": event.step, "grad_norm": value.simple_value, "tag": value.tag}
+            )
+
+# Convert to DataFrame and save to CSV
+df = pd.DataFrame(data)
+df.to_csv("gradient_data.csv")
diff --git a/scripts/run-compare.sh b/scripts/run-compare.sh
index 5d73963..1b63596 100755
--- a/scripts/run-compare.sh
+++ b/scripts/run-compare.sh
@@ -1,29 +1,124 @@
 #!/bin/bash
-if [ -z "$1" ]; then
-    echo "No argument provided for the name of the experiment!"
-    exit 1
+debug=false
+append_commit_hash=true
+root_dir="${CMMVAE_ROOT_DIR}"
+experiment="${CMMVAE_EXPERIMENT_NAME}"
+data="${CMMVAE_DATA_CONFIG}"
+compare=""
+max_epochs="${CMMVAE_MAX_EPOCHS}"
+commit_hash=""
+extra_args=""
+
+if [ -z "${max_epochs}" ]; then
+    max_epochs=5
+else
+    echo "CMMVAE_MAX_EPOCHS is set to '$CMMVAE_MAX_EPOCHS'"
 fi
-if [ -z "$2" ]; then
-    echo "No argument provided for model config filename!"
+if [ -z "${data}" ]; then
+    data=configs/data/local.yaml
+else
+    echo "CMMVAE_DATA_CONFIG is set to '$CMMVAE_DATA_CONFIG'"
+fi
+
+if [ -z "${root_dir}" ]; then
+    root_dir=lightning_logs
+else
+    echo "CMMVAE_ROOT_DIR is set to '$CMMVAE_ROOT_DIR'"
+fi
+
+if [ -z "${experiment}" ]; then
+    experiment=default
+else
+    echo "CMMVAE_EXPERIMENT_NAME is set to '$CMMVAE_EXPERIMENT_NAME'"
+fi
+
+for arg in "$@"
+do
+    case $arg in
+        --debug)
+        debug=true
+        shift
+        ;;
+        --no-commit-hash)
+        append_commit_hash=false
+        shift
+        ;;
+        root_dir=*)
+        root_dir="${arg#*=}"
+        shift
+        ;;
+        experiment=*)
+        experiment="${arg#*=}"
+        shift
+        ;;
+        compare=*)
+        compare="${arg#*=}"
+        shift
+        ;;
+        data=*)
+        data="${arg#*=}"
+        shift
+        ;;
+        max_epochs=*)
+        max_epochs="${arg#*=}"
+        shift
+        ;;
+        *)
+        extra_args="$extra_args $arg"
+        shift
+        ;;
+    esac
+done
+
+if [ -z "$compare" ]; then
+    echo "Please specify directory that contains model configs to compare."
     exit 1
 fi
-for file in "$2"/*.yaml
+if [ "$append_commit_hash" = true ]; then
+    if ! command -v git &> /dev/null; then
+        echo "Error: Git is not installed. Please install Git to use this script or specify --no-commit-hash."
+        exit 1
+    fi
+
+    if ! git rev-parse --is-inside-work-tree &> /dev/null; then
+        echo "Error: This is not a Git repository. Please run the script inside a Git repository or specify --no-commit-hash."
+        exit 1
+    fi
+
+    commit_hash=$(git rev-parse --short HEAD)
+    echo "Latest Commit Hash: $commit_hash"
+else
+    echo "Skipping commit hash display."
+fi
+
+for file in "$compare"/*.yaml
 do
-    filename=$(basename "$file" .yaml)
-    echo "Processing: $filename"
-    sbatch scripts/run-snakemake.sh --config \
-        experiment_name=$1\
-        run_name=$filename \
-        root_dir=lightning_logs \
-        train_command=\
-"\
-fit \
---data configs/data/local.yaml \
---model $file \
---trainer.max_epochs 5 \
-"
+    run_name=$(basename "$file" .yaml)
+
+    if [ "$commit_hash" != "" ]; then
+        run_name="${run_name}.${commit_hash}"
+    fi
+
+    ran_dirs=$(ls -d "$root_dir/$experiment/$run_name"* 2>/dev/null)
+
+    if [ -z "$ran_dirs" ]; then
+        version="V000"
+    else
+        version=$(echo "$ran_dirs" | grep -E 'V[0-9]{3}$' | sort -V | tail -n 1 | sed -E 's/.*V([0-9]{3})$/\1/' | awk '{printf "V%03d", $1 + 1}')
+    fi
+    echo "Processing: $file"
+    command="scripts/run-snakemake.sh --config \
+        root_dir=${root_dir} \
+        experiment_name=${experiment} \
+        run_name=${version}.${run_name} \
+        train_command=\"fit --model $file --data $data --trainer.max_epochs $max_epochs $extra_args\"
+    "
+    echo $command
+    if [ "$debug" = false ]; then
+        eval "sbatch $command"
+    fi
 done
diff --git a/src/cmmvae/models/cmmvae_model.py b/src/cmmvae/models/cmmvae_model.py
index 2a74abc..6ec411a 100644
--- a/src/cmmvae/models/cmmvae_model.py
+++ b/src/cmmvae/models/cmmvae_model.py
@@ -206,6 +206,7 @@ def training_step(
         qz, pz, z, xhats, hidden_representations = self.module(
             x=x, metadata=metadata, expert_id=expert_id
         )
+        # assert isinstance(qz, torch.distributions.Normal)
 
         if x.layout == torch.sparse_csr:
             x = x.to_dense()
@@ -214,6 +215,10 @@
         main_loss_dict = self.module.vae.elbo(
             qz, pz, x, xhats[expert_id], self.kl_annealing_fn.kl_weight
         )
+
+        main_loss_dict["Mean"] = qz.mean.mean()
+        main_loss_dict["Variance"] = qz.variance.mean()
+
         total_loss = main_loss_dict[RK.LOSS]
 
         adv_loss = None
diff --git a/src/cmmvae/modules/base/__init__.py b/src/cmmvae/modules/base/__init__.py
index 56a2312..90dfb1c 100644
--- a/src/cmmvae/modules/base/__init__.py
+++ b/src/cmmvae/modules/base/__init__.py
@@ -10,6 +10,7 @@
     ConditionalLayer,
     ConditionalLayers,
     GradientReversalFunction,
+    ConcatBlockConfig,
 )
 
 from cmmvae.modules.base.annealing_fn import KLAnnealingFn, LinearKLAnnealingFn
@@ -17,6 +18,7 @@
 __all__ = [
     "ConditionalLayer",
     "ConditionalLayers",
+    "ConcatBlockConfig",
     "Encoder",
     "Expert",
     "Experts",
diff --git a/src/cmmvae/modules/base/components.py b/src/cmmvae/modules/base/components.py
index 40dbc9c..19d7ae4 100644
--- a/src/cmmvae/modules/base/components.py
+++ b/src/cmmvae/modules/base/components.py
@@ -174,6 +174,22 @@ def validate(self):
                 self._validate_option(name, req_type, **kwargs)
 
 
+class ConcatBlockConfig(FCBlockConfig):
+    def __init__(
+        self,
+        dropout_rate: float = 0.0,
+        use_batch_norm: bool = False,
+        use_layer_norm: bool = False,
+        return_hidden: bool = False,
+        activation_fn: Optional[Type[nn.Module]] = None,
+    ):
+        self.dropout_rate = dropout_rate
+        self.use_batch_norm = use_batch_norm
+        self.use_layer_norm = use_layer_norm
+        self.return_hidden = return_hidden
+        self.activation_fn = activation_fn
+
+
 class FCBlock(nn.Module):
     """
     Fully Connected Block for building neural network layers.
@@ -500,7 +516,8 @@ def __init__(
         self.shared_conditionals = list(conditional_paths["shared"].keys())
 
         self.shuffle_selection_order = False
-        if not selection_order:
+        self.is_parallel = bool(selection_order) and selection_order[0] == "parallel"
+        if not selection_order or self.is_parallel:
             selection_order = conditionals
             self.shuffle_selection_order = True
 
@@ -589,6 +606,7 @@ def forward(
         else:
             order = self.selection_order
 
+        xs = []
         # Apply each layer in the determined order
        for conditional in order:
             layer = self.layers[conditional]
@@ -599,10 +617,17 @@
                 )
                 layer = layer[species]
             if isinstance(layer, ConditionalLayer):
-                x = layer(x, metadata)
+                if self.is_parallel:
+                    xs.append(layer(x, metadata))
+                else:
+                    x = layer(x, metadata)
             else:
-                x = layer(x)
-
+                if self.is_parallel:
+                    xs.append(layer(x))
+                else:
+                    x = layer(x)
+        if xs:
+            x = torch.cat(xs, dim=1)
         return x
diff --git a/src/cmmvae/modules/clvae.py b/src/cmmvae/modules/clvae.py
index f84199a..a446631 100644
--- a/src/cmmvae/modules/clvae.py
+++ b/src/cmmvae/modules/clvae.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 from cmmvae.modules.vae import VAE
-from cmmvae.modules.base import FCBlockConfig, ConditionalLayers
+from cmmvae.modules.base import FCBlockConfig, ConditionalLayers, ConcatBlockConfig
 
 
 class CLVAE(VAE):
@@ -36,27 +36,56 @@ def __init__(
         conditionals_directory: Optional[str] = None,
         conditionals: Optional[list[str]] = None,
         selection_order: Optional[list[str]] = None,
+        concat_config: Optional[ConcatBlockConfig] = None,
         **encoder_kwargs
     ):
-        super().__init__(
-            encoder_config=encoder_config,
-            decoder_config=decoder_config,
-            **encoder_kwargs,
-        )
-
+        conditionals_module = None
         if conditional_config and conditionals and conditionals_directory:
-            self.conditionals = ConditionalLayers(
+            conditionals_module = ConditionalLayers(
                 directory=conditionals_directory,
                 conditionals=conditionals,
                 fc_block_config=conditional_config,
                 selection_order=selection_order,
             )
         else:
-            self.conditionals = None
             import warnings
 
             warnings.warn("No conditionals found for vae")
 
+        if selection_order and selection_order[0] == "parallel":
+            if not concat_config:
+                raise RuntimeError(
+                    "Please define concat_config when selection_order = parallel"
+                )
+            concat_dim = (
+                len(conditionals_module.selection_order) * conditional_config.layers[-1]
+            )
+
+            decoder_config.layers = [concat_dim] + decoder_config.layers
+            decoder_config.activation_fn = [
+                concat_config.activation_fn
+            ] + decoder_config.activation_fn
+            decoder_config.dropout_rate = [
+                concat_config.dropout_rate
+            ] + decoder_config.dropout_rate
+            decoder_config.return_hidden = [
+                concat_config.return_hidden
+            ] + decoder_config.return_hidden
+            decoder_config.use_layer_norm = [
+                concat_config.use_layer_norm
+            ] + decoder_config.use_layer_norm
+            decoder_config.use_batch_norm = [
+                concat_config.use_batch_norm
+            ] + decoder_config.use_batch_norm
+
+        super().__init__(
+            encoder_config=encoder_config,
+            decoder_config=decoder_config,
+            **encoder_kwargs,
+        )
+
+        self.conditionals = conditionals_module
+
     def after_reparameterize(
         self, z: torch.Tensor, metadata: pd.DataFrame, **kwargs
     ) -> torch.Tensor:
diff --git a/src/cmmvae/modules/vae.py b/src/cmmvae/modules/vae.py
index 8d6de36..7964f08 100644
--- a/src/cmmvae/modules/vae.py
+++ b/src/cmmvae/modules/vae.py
@@ -133,19 +133,27 @@ def elbo(
             - RK.LOSS: Total loss.
             - RK.KL_WEIGHT: KL weight.
""" - z_kl_div = kl_divergence(qz, pz).sum(dim=-1) + z_kl_div = kl_divergence(qz, pz) + z_kl_div = z_kl_div.sum(dim=-1) + z_kl_div = z_kl_div.mean() if x.layout == torch.sparse_csr: x = x.to_dense() recon_loss = F.mse_loss(xhat, x, reduction="sum") + # recon_loss = F.mse_loss(xhat, x, reduction="none") + # recon_loss = recon_loss.sum(dim=1) - loss = recon_loss + kl_weight * z_kl_div.mean() + loss = recon_loss + (kl_weight * z_kl_div) + # loss = torch.mean(z_kl_div * kl_weight + recon_loss) + + recon_loss = recon_loss / x.numel() + # recon_loss = recon_loss.mean() return { - RK.RECON_LOSS: recon_loss / x.numel(), - RK.KL_LOSS: z_kl_div.mean(), RK.LOSS: loss, + RK.RECON_LOSS: recon_loss, + RK.KL_LOSS: z_kl_div, RK.KL_WEIGHT: kl_weight, } diff --git a/workflow/profile/slurm/config.yaml b/workflow/profile/slurm/config.yaml index c8193fb..04557ba 100644 --- a/workflow/profile/slurm/config.yaml +++ b/workflow/profile/slurm/config.yaml @@ -26,41 +26,41 @@ jobs: 10 set-resources: diff_expression: partition: bigmem - mem: 179GB + mem: 100GB gpus_per_node: "" cpus_per_task: 1 train: partition: gpu - mem: 179GB + mem: 100GB gpus_per_node: tesla_v100s:1 - cpus_per_task: 12 + cpus_per_task: 6 predict: partition: gpu - mem: 179GB + mem: 100GB gpus_per_node: 1 - cpus_per_task: 12 + cpus_per_task: 6 merge_predictions: partition: all - mem: 179GB + mem: 100GB gpus_per_node: "" cpus_per_task: 1 correlations: partition: gpu - mem: 179GB + mem: 100GB gpus_per_node: 1 - cpus_per_task: 12 + cpus_per_task: 6 run_correlations: partition: cpu - mem: 179GB + mem: 100GB gpus_per_node: "" cpus_per_task: 1 umap_predictions: partition: all - mem: 179GB + mem: 100GB gpus_per_node: "" cpus_per_task: 40 meta_discriminators: partition: gpu - mem: 179GB + mem: 100GB gpus_per_node: tesla_v100s:1 - cpus_per_task: 12 + cpus_per_task: 6