diff --git a/expts/hydra-configs/accelerator/ipu.yaml b/expts/hydra-configs/accelerator/ipu.yaml
index 43e4455ef..3e6fb4429 100644
--- a/expts/hydra-configs/accelerator/ipu.yaml
+++ b/expts/hydra-configs/accelerator/ipu.yaml
@@ -1,6 +1,8 @@
 type: ipu
 ipu_config:
-  - deviceIterations(30) # IPU would require large batches to be ready for the model.
+  - deviceIterations(60) # IPU would require large batches to be ready for the model.
+    # 60 for PCQM4mv2
+    # 30 for largemix
   - replicationFactor(16)
   # - enableProfiling("graph_analyser") # The folder where the profile will be stored
   # - enableExecutableCaching("pop_compiler_cache")
diff --git a/expts/hydra-configs/dataset/pcqm4m.yaml b/expts/hydra-configs/architecture/pcqm4m.yaml
similarity index 58%
rename from expts/hydra-configs/dataset/pcqm4m.yaml
rename to expts/hydra-configs/architecture/pcqm4m.yaml
index 391bb21de..494875765 100644
--- a/expts/hydra-configs/dataset/pcqm4m.yaml
+++ b/expts/hydra-configs/architecture/pcqm4m.yaml
@@ -1,70 +1,5 @@
 # @package _global_
 
-datamodule:
-  module_type: "MultitaskFromSmilesDataModule"
-  # module_type: "FakeDataModule"  # Option to use generated data
-  args: # Matches that in the test_multitask_datamodule.py case.
-    task_specific_args:   # To be replaced by a new class "DatasetParams"
-      homolumo:
-        df: null
-        task_level: "graph"
-        df_path: graphium/data/PCQM4M/pcqm4mv2.csv
-        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv
-        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly
-        smiles_col: "cxsmiles"
-        label_cols: ["homo_lumo_gap"]
-        # sample_size: 8000 # use sample_size for test
-        splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
-        split_names: ["train", "valid", "test-dev"]
-        # graphium/data/PCQM4Mv2/split_dict.pt
-        # graphium/data/PCQM4Mv2/pcqm4m_split.csv
-        # split_val: 0.1
-        # split_test: 0.1
-        seed: ${constants.seed}
-        label_normalization:
-          method: "normal"
-
-    # Featurization
-    prepare_dict_or_graph: pyg:graph
-    featurization_n_jobs: 30
-    featurization_progress: True
-    featurization_backend: "loky"
-    processed_graph_data_path: "../datacache/PCQM4Mv2/"
-    featurization:
-      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
-      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
-      # 'num_chiral_centers (not included yet)']
-      atom_property_list_onehot: [atomic-number, group, period, total-valence]
-      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
-      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
-      edge_property_list: [bond-type-onehot, stereo, in-ring]
-      add_self_loop: False
-      explicit_H: False # if H is included
-      use_bonds_weights: False
-      pos_encoding_as_features: # encoder dropout 0.18
-        pos_types:
-          lap_eigvec:
-            pos_level: node
-            pos_type: laplacian_eigvec
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          lap_eigval:
-            pos_level: node
-            pos_type: laplacian_eigval
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          rw_pos: # use same name as pe_encoder
-            pos_level: node
-            pos_type: rw_return_probs
-            ksteps: 16
-
-    # cache_data_path: .
-    num_workers: 30 # -1 to use all
-    persistent_workers: False # if use persistent worker at the start of each epoch.
-    # Using persistent_workers false might make the start of each epoch very long.
-
 architecture:
   model_type: FullGraphMultiTaskNetwork
   mup_base_path: null
@@ -144,78 +79,46 @@ architecture:
     last_normalization: "none"
     residual_type: none
 
-  task_heads:
-    homolumo:
-      task_level: graph
-      out_dim: 1
-      hidden_dims: 256
-      depth: 2 # Not needed if we have hidden_dims
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-
-#Task-specific
-predictor:
-  metrics_on_progress_bar:
-    homolumo: []
-  metrics_on_training_set:
-    homolumo: ["pearsonr"]
-  loss_fun:
-    homolumo: mae_ipu
-  random_seed: ${constants.seed}
-  optim_kwargs:
-    lr: 4.e-4 # warmup can be scheduled using torch_scheduler_kwargs
-    # weight_decay: 1.e-7
-  torch_scheduler_kwargs:
-    module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 100
-    warmup_epochs: 10
-    verbose: False
-  scheduler_kwargs:
-    # monitor: &monitor homolumo/mae/train
-    # mode: min
-    # frequency: 1
-  target_nan_mask: null # null: no mask, 0: 0 mask, ignore: ignore nan values from loss
-  flag_kwargs:
-    n_steps: 0 # 1
-    alpha: 0.0 # 0.01
-
-# Task-specific
-metrics:
-  homolumo:
-    - name: mae
-      metric: mae_ipu
-      target_nan_mask: null
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-    - name: pearsonr
-      metric: pearsonr_ipu
-      threshold_kwargs: null
-      target_nan_mask: null
-      multitask_handling: mean-per-label
+datamodule:
+  module_type: "MultitaskFromSmilesDataModule"
+  # module_type: "FakeDataModule"  # Option to use generated data
+  args: # Matches that in the test_multitask_datamodule.py case.
+    # Featurization
+    prepare_dict_or_graph: pyg:graph
+    featurization_n_jobs: 30
+    featurization_progress: True
+    featurization_backend: "loky"
+    processed_graph_data_path: ${constants.datacache_path}
+    num_workers: 40 # -1 to use all
+    persistent_workers: False # whether to use persistent workers at the start of each epoch.
+    # Using persistent_workers false might make the start of each epoch very long.
+    featurization:
+      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
+      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
+      # 'num_chiral_centers (not included yet)']
+      atom_property_list_onehot: [atomic-number, group, period, total-valence]
+      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
+      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
+      edge_property_list: [bond-type-onehot, stereo, in-ring]
+      add_self_loop: False
+      explicit_H: False # if H is included
+      use_bonds_weights: False
+      pos_encoding_as_features: # encoder dropout 0.18
+        pos_types:
+          lap_eigvec:
+            pos_level: node
+            pos_type: laplacian_eigvec
+            num_pos: 8
+            normalization: "none" # normalization already applied on the eigenvectors
+            disconnected_comp: True # if eigen values/vector for disconnected graph are included
+          lap_eigval:
+            pos_level: node
+            pos_type: laplacian_eigval
+            num_pos: 8
+            normalization: "none" # normalization already applied on the eigenvectors
+            disconnected_comp: True # if eigen values/vector for disconnected graph are included
+          rw_pos: # use same name as pe_encoder
+            pos_level: node
+            pos_type: rw_return_probs
+            ksteps: 16
 
-trainer:
-  seed: ${constants.seed}
-  logger:
-    save_dir: logs/PCQMv2
-    name: ${constants.name}
-    project: PCQMv2_mpnn
-  #early_stopping:
-  #  monitor: *monitor
-  #  min_delta: 0
-  #  patience: 10
-  #  mode: &mode min
-  model_checkpoint:
-    dirpath: models_checkpoints/PCMQ4Mv2/
-    filename: ${constants.name}
-    #monitor: *monitor
-    #mode: *mode
-    save_top_k: 1
-    every_n_epochs: 100
-  trainer:
-    max_epochs: *max_epochs
-    min_epochs: 1
-    check_val_every_n_epoch: 20
diff --git a/expts/hydra-configs/main.yaml b/expts/hydra-configs/main.yaml
index 24962eddc..d4b3beceb 100644
--- a/expts/hydra-configs/main.yaml
+++ b/expts/hydra-configs/main.yaml
@@ -13,4 +13,4 @@ defaults:
 
   # Specializations
   - training/accelerator: ${training}_${accelerator}
-  - training/model: ${training}_${model}
\ No newline at end of file
+  - training/model: ${training}_${model}
diff --git a/expts/hydra-configs/model/gpspp.yaml b/expts/hydra-configs/model/gpspp.yaml
index 0b231fcf1..7ec357fe1 100644
--- a/expts/hydra-configs/model/gpspp.yaml
+++ b/expts/hydra-configs/model/gpspp.yaml
@@ -1,5 +1,15 @@
 # @package _global_
 
+datamodule:
+  args:
+    batch_size_training: 32
+    featurization:
+      conformer_property_list: [positions_3d]
+
+trainer:
+  trainer:
+    accumulate_grad_batches: 2
+
 architecture:
   pe_encoders:
     encoders:
@@ -31,8 +41,3 @@ architecture:
       num_heads: 32
       droppath_rate_attn: 0.0
       droppath_rate_ffn: 0.0
-
-datamodule:
-  args: # Matches that in the test_multitask_datamodule.py case.
-    featurization:
-      conformer_property_list: [positions_3d]
diff --git a/expts/hydra-configs/model/mpnn.yaml b/expts/hydra-configs/model/mpnn.yaml
index 4a8a428e8..dce40c932 100644
--- a/expts/hydra-configs/model/mpnn.yaml
+++ b/expts/hydra-configs/model/mpnn.yaml
@@ -1,5 +1,13 @@
 # @package _global_
 
+datamodule:
+  args:
+    batch_size_training: 64
+
+trainer:
+  trainer:
+    accumulate_grad_batches: 1
+
 architecture:
   gnn:
     layer_type: 'pyg:gps'
diff --git a/expts/hydra-configs/tasks/pcqm4m.yaml b/expts/hydra-configs/tasks/pcqm4m.yaml
new file mode 100644
index 000000000..d92d381f7
--- /dev/null
+++ b/expts/hydra-configs/tasks/pcqm4m.yaml
@@ -0,0 +1,62 @@
+# @package _global_
+
+architecture:
+  task_heads:
+    homolumo:
+      task_level: graph
+      out_dim: 1
+      hidden_dims: 256
+      depth: 2 # Not needed if we have hidden_dims
+      activation: relu
+      last_activation: none
+      dropout: 0.18
+      normalization: layer_norm
+      last_normalization: "none"
+      residual_type: none
+
+# Task-specific
+predictor:
+  metrics_on_progress_bar:
+    homolumo: []
+  metrics_on_training_set:
+    homolumo: ["pearsonr"]
+  loss_fun:
+    homolumo: mae_ipu
+
+# Task-specific
+metrics:
+  homolumo:
+    - name: mae
+      metric: mae_ipu
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+    - name: pearsonr
+      metric: pearsonr_ipu
+      threshold_kwargs: null
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+
+datamodule:
+  module_type: "MultitaskFromSmilesDataModule"
+  # module_type: "FakeDataModule"  # Option to use generated data
+  args: # Matches that in the test_multitask_datamodule.py case.
+    task_specific_args:   # To be replaced by a new class "DatasetParams"
+      homolumo:
+        df: null
+        task_level: "graph"
+        df_path: graphium/data/PCQM4M/pcqm4mv2.csv
+        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv
+        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly
+        smiles_col: "cxsmiles"
+        label_cols: ["homo_lumo_gap"]
+        # sample_size: 8000 # use sample_size for test
+        splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
+        split_names: ["train", "valid", "test-dev"]
+        # graphium/data/PCQM4Mv2/split_dict.pt
+        # graphium/data/PCQM4Mv2/pcqm4m_split.csv
+        # split_val: 0.1
+        # split_test: 0.1
+        seed: ${constants.seed}
+        label_normalization:
+          method: "normal"
diff --git a/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml b/expts/hydra-configs/training/accelerator/pcqm4m_ipu.yaml
similarity index 84%
rename from expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml
rename to expts/hydra-configs/training/accelerator/pcqm4m_ipu.yaml
index 6502f9414..a7e23f383 100644
--- a/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml
+++ b/expts/hydra-configs/training/accelerator/pcqm4m_ipu.yaml
@@ -11,15 +11,13 @@ datamodule:
       max_num_nodes_per_graph: 30 # valid max nodes: 51, max_edges: 118
       max_num_edges_per_graph: 120
     # Data handling-related
-    batch_size_training: 32
     batch_size_inference: 16
 
 predictor:
-  metrics_every_n_train_steps: 1000
+  metrics_every_n_train_steps: 100
   optim_kwargs:
     loss_scaling: 1024
 
 trainer:
   trainer:
     precision: 16-true
-    accumulate_grad_batches: 2
diff --git a/expts/hydra-configs/experiment/pcqm4m_gpspp.yaml b/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml
similarity index 89%
rename from expts/hydra-configs/experiment/pcqm4m_gpspp.yaml
rename to expts/hydra-configs/training/model/pcqm4m_gpspp.yaml
index a321e835d..e13c44aa0 100644
--- a/expts/hydra-configs/experiment/pcqm4m_gpspp.yaml
+++ b/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml
@@ -7,6 +7,7 @@ constants:
   seed: 42
   max_epochs: 100
   raise_train_error: true # Whether the code should raise an error if it crashes during training
+  datacache_path: "/localdata/PCQM4Mv2/"
 
 trainer:
   model_checkpoint:
diff --git a/expts/hydra-configs/experiment/pcqm4m_mpnn.yaml b/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
similarity index 88%
rename from expts/hydra-configs/experiment/pcqm4m_mpnn.yaml
rename to expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
index 08b1b1f3c..41b55eba1 100644
--- a/expts/hydra-configs/experiment/pcqm4m_mpnn.yaml
+++ b/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
@@ -7,6 +7,7 @@ constants:
   seed: 42
   max_epochs: 100
   raise_train_error: true # Whether the code should raise an error if it crashes during training
+  datacache_path: "/localdata/PCQM4Mv2/"
 
 trainer:
   model_checkpoint:
diff --git a/expts/hydra-configs/training/pcqm4m.yaml b/expts/hydra-configs/training/pcqm4m.yaml
new file mode 100644
index 000000000..910c78c67
--- /dev/null
+++ b/expts/hydra-configs/training/pcqm4m.yaml
@@ -0,0 +1,44 @@
+# @package _global_
+
+predictor:
+  random_seed: ${constants.seed}
+  optim_kwargs:
+    lr: 4.e-4 # warmup can be scheduled using torch_scheduler_kwargs
+    # weight_decay: 1.e-7
+  torch_scheduler_kwargs:
+    module_type: WarmUpLinearLR
+    max_num_epochs: &max_epochs 100
+    warmup_epochs: 10
+    verbose: False
+  scheduler_kwargs:
+    # monitor: &monitor homolumo/mae/train
+    # mode: min
+    # frequency: 1
+  target_nan_mask: null # null: no mask, 0: 0 mask, ignore: ignore nan values from loss
+  flag_kwargs:
+    n_steps: 0 # 1
+    alpha: 0.0 # 0.01
+
+
+trainer:
+  seed: ${constants.seed}
+  logger:
+    save_dir: logs/PCQMv2
+    name: ${constants.name}
+    project: PCQMv2_mpnn
+  #early_stopping:
+  #  monitor: *monitor
+  #  min_delta: 0
+  #  patience: 10
+  #  mode: &mode min
+  model_checkpoint:
+    dirpath: models_checkpoints/PCMQ4Mv2/
+    filename: ${constants.name}
+    #monitor: *monitor
+    #mode: *mode
+    save_top_k: 1
+    every_n_epochs: 100
+  trainer:
+    max_epochs: *max_epochs
+    min_epochs: 1
+    check_val_every_n_epoch: 20
diff --git a/expts/neurips2023_configs/base_config/large.yaml b/expts/neurips2023_configs/base_config/large.yaml
index da9760ab3..f7de28a6c 100644
--- a/expts/neurips2023_configs/base_config/large.yaml
+++ b/expts/neurips2023_configs/base_config/large.yaml
@@ -1,9 +1,10 @@
 # @package _global_
 
 constants:
-  seed: &seed 42
+  seed: 42
   raise_train_error: true # Whether the code should raise an error if it crashes during training
   entity: multitask-gnn
+  datacache_path: "/localdata/neurips2023-large/"
 
 accelerator:
   type: ipu # cpu or ipu or gpu
@@ -125,7 +126,7 @@ datamodule:
         # sample_size: 2000 # use sample_size for test
         task_level: node
         splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
-        seed: *seed
+        seed: ${constants.seed}
         label_normalization:
           normalize_val_test: True
           method: "normal"
@@ -136,7 +137,7 @@ datamodule:
     featurization_n_jobs: 30
     featurization_progress: True
     featurization_backend: "loky"
-    processed_graph_data_path: "../datacache/neurips2023-large/"
+    processed_graph_data_path: ${constants.datacache_path}
     featurization:
      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
@@ -340,7 +341,7 @@ predictor:
     pcba_1328: bce_logits_ipu
     pcqm4m_g25: mae_ipu
     pcqm4m_n4: mae_ipu
-  random_seed: *seed
+  random_seed: ${constants.seed}
   optim_kwargs:
     lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs
     # weight_decay: 1.e-7
@@ -363,6 +364,9 @@ metrics:
       metric: auroc_ipu
       num_classes: 5
      task: multiclass
+      target_to_int: True
+      target_nan_mask: -1000
+      ignore_index: -1000
       multitask_handling: mean-per-label
       threshold_kwargs: null
     - name: avpr
@@ -376,15 +380,18 @@ metrics:
       threshold_kwargs: null
   l1000_mcf7: *classif_metrics
   pcba_1328:
+    # use auroc and averageprecision (non-IPU version) so that NaNs are handled correctly
     - name: auroc
-      metric: auroc_ipu
+      metric: auroc
       task: binary
       multitask_handling: mean-per-label
+      target_nan_mask: ignore
       threshold_kwargs: null
     - name: avpr
-      metric: average_precision_ipu
+      metric: averageprecision
       task: binary
       multitask_handling: mean-per-label
+      target_nan_mask: ignore
       threshold_kwargs: null
   pcqm4m_g25: &pcqm_metrics
     - name: mae
@@ -405,7 +412,7 @@ metrics:
   pcqm4m_n4: *pcqm_metrics
 
 trainer:
-  seed: *seed
+  seed: ${constants.seed}
   logger:
     save_dir: logs/neurips2023-large/
     name: ${constants.name}
diff --git a/expts/neurips2023_configs/config_large_gcn.yaml b/expts/neurips2023_configs/config_large_gcn.yaml
index 1dc397998..84f4dcaa6 100644
--- a/expts/neurips2023_configs/config_large_gcn.yaml
+++ b/expts/neurips2023_configs/config_large_gcn.yaml
@@ -9,4 +9,4 @@ constants:
 
 architecture:
   gnn:  # Set as null to avoid a post-nn network
-    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
\ No newline at end of file
+    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
diff --git a/expts/neurips2023_configs/config_large_mpnn.yaml b/expts/neurips2023_configs/config_large_mpnn.yaml
index b56f8dd2d..365927473 100644
--- a/expts/neurips2023_configs/config_large_mpnn.yaml
+++ b/expts/neurips2023_configs/config_large_mpnn.yaml
@@ -1,4 +1,4 @@
-# Testing the mpnn only model with the PCQMv2 dataset on IPU.
+# Running the mpnn model with the largemix dataset on IPU.
 defaults:
   - base_config: large
 
@@ -7,8 +7,20 @@ constants:
   name: neurips2023_large_data_mpnn
 
 architecture:
-  pre_nn_edges: # Set as null to avoid a pre-nn network
-    out_dim: 32
+
+  pre_nn:
+    out_dim: 160
+    hidden_dims: 256
+    depth: 2
+    activation: relu
+    last_activation: none
+    dropout: &dropout 0.1
+    normalization: &normalization layer_norm
+    last_normalization: *normalization
+    residual_type: none
+
+  pre_nn_edges:
+    out_dim: 64
     hidden_dims: 128
     depth: 2
     activation: relu
@@ -19,9 +31,26 @@ architecture:
     residual_type: none
 
   gnn:  # Set as null to avoid a post-nn network
-    out_dim: &gnn_dim 64
-    hidden_dims: *gnn_dim
-    layer_type: 'pyg:gps' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-    layer_kwargs: # Parameters for the model itself. You could define dropout_attn: 0.1
+    in_dim: 160 # should be consistent with pre_nn.out_dim
+    out_dim: 256
+    hidden_dims: &gnn_dim 160 # should be consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer)
+    depth: 4
+    activation: gelu
+    last_activation: none
+    dropout: 0.1
+    normalization: "layer_norm"
+    last_normalization: *normalization
+    residual_type: simple
+    virtual_node: 'none'
+    layer_type: 'pyg:gps'
+    layer_kwargs:
+      node_residual: false
       mpnn_type: 'pyg:mpnnplus'
-      out_dim_edges: 32
+      mpnn_kwargs:
+        in_dim: 160 # should be consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer)
+        out_dim: 160 # should be consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer)
+        in_dim_edges: 64 # should be consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer)
+        out_dim_edges: 64 # should be consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer)
+      attn_type: "none" # "full-attention", "none"
+      # biased_attention: false
+      attn_kwargs: null
diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py
index e98833b61..6fc139aab 100644
--- a/graphium/trainer/predictor.py
+++ b/graphium/trainer/predictor.py
@@ -464,6 +464,7 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None:
             concatenated_metrics_logs[f"train/loss/{task}"] = outputs["task_losses"][task]
 
         # get the mean loss value for individual tasks as they are a tensor of size --> gradient accumulation * replication * device_iter
+        # filter zeros out for the individual losses
         for key in concatenated_metrics_logs:
             if isinstance(concatenated_metrics_logs[key], torch.Tensor):
                 if concatenated_metrics_logs[key].numel() > 1:
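Note on the final predictor.py hunk: the new comment says the per-task loss arrives as a tensor of size gradient accumulation * replication * deviceIterations, and zero-filled slots should be dropped before the mean is logged. Below is a minimal sketch of that idea, not Graphium's actual implementation; the helper name and the fallback to the plain mean when every entry is zero are assumptions.

```python
import torch

def mean_nonzero_loss(losses: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper: average per-step losses while ignoring zero entries.

    With gradient accumulation * replication * deviceIterations, the logged loss
    for a task is a vector; zero-filled slots would otherwise bias the mean.
    """
    nonzero = losses[losses != 0]
    # If every entry is zero, fall back to the plain mean to avoid returning NaN.
    return nonzero.mean() if nonzero.numel() > 0 else losses.mean()

# Example: only the three non-zero values contribute to the logged mean (~0.5).
print(mean_nonzero_loss(torch.tensor([0.0, 0.52, 0.48, 0.0, 0.50])))
```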