From 055cad14ad40b5039d800abe7bd7282b784e6fce Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Thu, 31 Aug 2023 15:56:08 +0000 Subject: [PATCH 01/58] fix pyproject.toml to graphium.cli.train_finetune_test --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f24f61c82..78a5869da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ dependencies = [ [project.scripts] graphium = "graphium.cli.main:app" -graphium-train = "graphium.cli.train_finetune:cli" +graphium-train = "graphium.cli.train_finetune_test:cli" [project.urls] Website = "https://graphium.datamol.io/" From aa7e79395db076274b01059e747d5b45cbc5005e Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 4 Sep 2023 14:48:08 +0000 Subject: [PATCH 02/58] Passing pipleine / accelerator args to model --- .../accelerator/ipu_pipeline.yaml | 22 +++++++++++++++++++ graphium/config/_loader.py | 3 +++ 2 files changed, 25 insertions(+) create mode 100644 expts/hydra-configs/accelerator/ipu_pipeline.yaml diff --git a/expts/hydra-configs/accelerator/ipu_pipeline.yaml b/expts/hydra-configs/accelerator/ipu_pipeline.yaml new file mode 100644 index 000000000..996218646 --- /dev/null +++ b/expts/hydra-configs/accelerator/ipu_pipeline.yaml @@ -0,0 +1,22 @@ +type: ipu +ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + # 60 for PCQM4mv2 + # 30 for largemix + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + +ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(60) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + +accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] \ No newline at end of file diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 3235c9b68..85e94b197 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -251,6 +251,8 @@ def load_architecture( # Set the parameters for the full network task_heads_kwargs = omegaconf.OmegaConf.to_object(task_heads_kwargs) + # Get accelerator_kwargs if they exist + accelerator_kwargs = config["accelerator"].get("accelerator_kwargs", None) # Set all the input arguments for the model model_kwargs = dict( gnn_kwargs=gnn_kwargs, @@ -259,6 +261,7 @@ def load_architecture( pe_encoders_kwargs=pe_encoders_kwargs, graph_output_nn_kwargs=graph_output_nn_kwargs, task_heads_kwargs=task_heads_kwargs, + accelerator_kwargs=accelerator_kwargs ) if model_class is FullGraphFinetuningNetwork: From 341a6563ca061e5866960e93858e26c1774677a5 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 10:00:35 +0000 Subject: [PATCH 03/58] Reworking the model kwargs --- graphium/config/_loader.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 85e94b197..c9d3d30dd 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -251,8 +251,6 @@ def load_architecture( # Set the parameters for the full network task_heads_kwargs = omegaconf.OmegaConf.to_object(task_heads_kwargs) - # Get accelerator_kwargs if they exist - accelerator_kwargs = 
config["accelerator"].get("accelerator_kwargs", None) # Set all the input arguments for the model model_kwargs = dict( gnn_kwargs=gnn_kwargs, @@ -260,9 +258,12 @@ def load_architecture( pre_nn_edges_kwargs=pre_nn_edges_kwargs, pe_encoders_kwargs=pe_encoders_kwargs, graph_output_nn_kwargs=graph_output_nn_kwargs, - task_heads_kwargs=task_heads_kwargs, - accelerator_kwargs=accelerator_kwargs + task_heads_kwargs=task_heads_kwargs ) + # Get accelerator_kwargs if they exist + accelerator_kwargs = config["accelerator"].get("accelerator_kwargs", None) + if accelerator_kwargs is not None: + model_kwargs["accelerator_kwargs"] = accelerator_kwargs if model_class is FullGraphFinetuningNetwork: finetuning_head_kwargs = config["finetuning"].pop("finetuning_head", None) From bf5379db1c8360825e71fd392b1649d56af3c733 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 10:01:56 +0000 Subject: [PATCH 04/58] linting loader --- graphium/config/_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index c9d3d30dd..da55a9266 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -258,7 +258,7 @@ def load_architecture( pre_nn_edges_kwargs=pre_nn_edges_kwargs, pe_encoders_kwargs=pe_encoders_kwargs, graph_output_nn_kwargs=graph_output_nn_kwargs, - task_heads_kwargs=task_heads_kwargs + task_heads_kwargs=task_heads_kwargs, ) # Get accelerator_kwargs if they exist accelerator_kwargs = config["accelerator"].get("accelerator_kwargs", None) From 4db06f0fb6819deafd9af227bd06b1925a7af7b5 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 10:22:21 +0000 Subject: [PATCH 05/58] Zinc config update for test including accelerator --- graphium/config/zinc_default_multitask_pyg.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphium/config/zinc_default_multitask_pyg.yaml b/graphium/config/zinc_default_multitask_pyg.yaml index 07ae4bf9b..b9435ec7e 100644 --- a/graphium/config/zinc_default_multitask_pyg.yaml +++ b/graphium/config/zinc_default_multitask_pyg.yaml @@ -181,3 +181,5 @@ architecture: # The parameters for the full graph network are taken from `co dropout: 0.2 normalization: none residual_type: none +accelerator: + type: cpu \ No newline at end of file From 20434b62669313745b02af7db18f61e07aa875b2 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 15:46:56 +0000 Subject: [PATCH 06/58] Fix to allow use edges for MPNN layer --- expts/hydra-configs/model/mpnn.yaml | 2 ++ .../nn/architectures/global_architectures.py | 27 ++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/expts/hydra-configs/model/mpnn.yaml b/expts/hydra-configs/model/mpnn.yaml index dce40c932..d37eecd36 100644 --- a/expts/hydra-configs/model/mpnn.yaml +++ b/expts/hydra-configs/model/mpnn.yaml @@ -22,3 +22,5 @@ architecture: attn_type: "none" # "full-attention", "none" # biased_attention: false attn_kwargs: null + virtual_node: 'sum' + use_virtual_edges: true diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index 903803642..69b235c1d 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -12,6 +12,7 @@ from torch import Tensor, nn import torch from torch_geometric.data import Data +from omegaconf import DictConfig, OmegaConf # graphium imports from graphium.data.utils import get_keys @@ -592,6 +593,26 @@ def 
_check_bad_arguments(self): (self.in_dim_edges > 0) or (self.full_dims_edges is not None) ) and not self.layer_class.layer_supports_edges: raise ValueError(f"Cannot use edge features with class `{self.layer_class}`") + + def get_nested_key(self, d, target_key): + """ + Get the value associated with a key in a nested dictionary. + + Parameters: + - d: The dictionary to search in + - target_key: The key to search for + + Returns: + - The value associated with the key if found, None otherwise + """ + if target_key in d: + return d[target_key] + for key, value in d.items(): + if isinstance(value, (dict, DictConfig)): + nested_result = self.get_nested_key(value, target_key) + if nested_result is not None: + return nested_result + return None def _create_layers(self): r""" @@ -632,6 +653,7 @@ def _create_layers(self): # Find the edge key-word arguments depending on the layer type and residual connection this_edge_kwargs = {} + # import ipdb; ipdb.set_trace() if self.layer_class.layer_supports_edges and self.in_dim_edges > 0: this_edge_kwargs["in_dim_edges"] = this_in_dim_edges if "out_dim_edges" in inspect.signature(self.layer_class.__init__).parameters.keys(): @@ -639,8 +661,10 @@ def _create_layers(self): this_out_dim_edges = self.full_dims_edges[ii + 1] this_edge_kwargs["out_dim_edges"] = this_out_dim_edges else: - this_out_dim_edges = self.layer_kwargs.get("out_dim_edges") + this_out_dim_edges = self.get_nested_key(self.layer_kwargs, "out_dim_edges") + this_edge_kwargs["out_dim_edges"] = this_out_dim_edges layer_out_dims_edges.append(this_out_dim_edges) + # import ipdb; ipdb.set_trace() # Create the GNN layer self.layers.append( @@ -659,6 +683,7 @@ def _create_layers(self): # Create the Virtual Node layer, except at the last layer if ii < len(residual_out_dims): + # import ipdb; ipdb.set_trace() self.virtual_node_layers.append( self.virtual_node_class( in_dim=this_out_dim * self.layers[-1].out_dim_factor, From daf011c480dc8e4b7165cc5018b8d876cb780144 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 15:52:51 +0000 Subject: [PATCH 07/58] linting + remove debug statements --- graphium/nn/architectures/global_architectures.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index 69b235c1d..a90a2cba9 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -593,15 +593,15 @@ def _check_bad_arguments(self): (self.in_dim_edges > 0) or (self.full_dims_edges is not None) ) and not self.layer_class.layer_supports_edges: raise ValueError(f"Cannot use edge features with class `{self.layer_class}`") - + def get_nested_key(self, d, target_key): """ Get the value associated with a key in a nested dictionary. 
- + Parameters: - d: The dictionary to search in - target_key: The key to search for - + Returns: - The value associated with the key if found, None otherwise """ @@ -653,7 +653,6 @@ def _create_layers(self): # Find the edge key-word arguments depending on the layer type and residual connection this_edge_kwargs = {} - # import ipdb; ipdb.set_trace() if self.layer_class.layer_supports_edges and self.in_dim_edges > 0: this_edge_kwargs["in_dim_edges"] = this_in_dim_edges if "out_dim_edges" in inspect.signature(self.layer_class.__init__).parameters.keys(): @@ -664,7 +663,6 @@ def _create_layers(self): this_out_dim_edges = self.get_nested_key(self.layer_kwargs, "out_dim_edges") this_edge_kwargs["out_dim_edges"] = this_out_dim_edges layer_out_dims_edges.append(this_out_dim_edges) - # import ipdb; ipdb.set_trace() # Create the GNN layer self.layers.append( @@ -683,7 +681,6 @@ def _create_layers(self): # Create the Virtual Node layer, except at the last layer if ii < len(residual_out_dims): - # import ipdb; ipdb.set_trace() self.virtual_node_layers.append( self.virtual_node_class( in_dim=this_out_dim * self.layers[-1].out_dim_factor, From 4a9893fed93b5b785a200dd2421098c1f6e32816 Mon Sep 17 00:00:00 2001 From: DomInvivo Date: Thu, 7 Sep 2023 17:04:50 -0400 Subject: [PATCH 08/58] Remove the gpu logging, it's handled automatically by lightning, and it was not right --- graphium/trainer/predictor_summaries.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/graphium/trainer/predictor_summaries.py b/graphium/trainer/predictor_summaries.py index d62e50a42..8ce863e74 100644 --- a/graphium/trainer/predictor_summaries.py +++ b/graphium/trainer/predictor_summaries.py @@ -248,8 +248,6 @@ def get_metrics_logs(self) -> Dict[str, Any]: metric_logs[self.metric_log_name(self.task_name, "median_target", self.step_name)] = nan_median( targets ) - if torch.cuda.is_available(): - metric_logs[f"gpu_allocated_GB"] = torch.tensor(torch.cuda.memory_allocated() / (2**30)) # Specify which metrics to use metrics_to_use = self.metrics From c1dea0de60b1049a4a56649518372490fad0a6e8 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 8 Sep 2023 12:23:14 +0000 Subject: [PATCH 09/58] added iclr configs --- .../.config_ogbpcq_mpnn_GPS++.yaml.swp | Bin 0 -> 12288 bytes expts/iclr2023_configs/__init__.py | 0 .../base_config/ogbpcqm4mv2.yaml | 288 ++++++++++++++++++ .../base_config/ogbpcqm4mv2_GPS++.yaml | 288 ++++++++++++++++++ .../iclr2023_configs/config_ogbpcq_mpnn.yaml | 57 ++++ .../config_ogbpcq_mpnn_GPS++.yaml | 47 +++ .../config_ogbpcq_mpnn_JosefOG.yaml | 57 ++++ .../config_ogbpcq_mpnn_hydradims.yaml | 101 ++++++ .../config_ogbpcq_mpnn_largerffn.yaml | 58 ++++ 9 files changed, 896 insertions(+) create mode 100644 expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp create mode 100644 expts/iclr2023_configs/__init__.py create mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml create mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml diff --git a/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp b/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp new file mode 100644 index 
0000000000000000000000000000000000000000..78ab30c0ce46b96b10e0a72414b1d02a037cb2fe GIT binary patch literal 12288 zcmeHN&x;&I6s`!0CT3mYB^TjkMqD+{uiiB*EeOd)f(Ri?Jc!WLOxN@j+g;suRnKM= z6};$0uX<4M;{PCuH**&8FAxGAym`&Vud2Jpo?T4#AWEbPK6?69y;rZk_v#J9yy^JO z>u=EhaKbRQ8T)PPUVL|T@zJ{un;2HS8Fkp%^>_Z&p zk>)_7=6UAWvw8Io&wyvZGvFEU40r}S1D*lTfM>un;2H1?cm^Iq26)WaoyQsb>IpcH z|Nq_^}`;0xe0;8WluUH0JsTU117*Tz&3Dii?LsUpMcMSw}5NFRbU(V z`8;Dk0IvhD0Da&w;EyMv1AGH~0w~~Z;3{wl*aE&jhrNJr&ocHZa1Qt#zW)Mz57={j z3Had|@C6DbBVH!ZQ0UHV8ls)3 z&WfF4m4x=aNGsiDn{&JrTE@uJZbG?mwRz}_)sl{_q;O{XgwdLa>Uk}*BDI<6CR46W zr1CQ3seBi0#x`|d+<0$SZU~Xc^`2JRYnQ}jULb}cjnVtu-GI-~z}#537oeT&$`g@R ztvnWmS%kFLzAt1P3zwC+c+VwBb=(ZP@{i@ne6Iew1df+=Dh4lKX}zfL#qoTZiodyz zZGu7_9*35X17V13S31b##Fx2@iBpklGr-+c!W`u#dcz6hSlAlc2!5W7*t75Pa<-60 z%uH2^VOKxFbZ#oWNGp Date: Fri, 8 Sep 2023 14:46:13 +0000 Subject: [PATCH 10/58] make one big sweep config --- .../.config_ogbpcq_mpnn_GPS++.yaml.swp | Bin 12288 -> 0 bytes .../config_ogbpcq_mpnn_GPS++.yaml | 285 ++++++++++++++++-- 2 files changed, 266 insertions(+), 19 deletions(-) delete mode 100644 expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp diff --git a/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp b/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp deleted file mode 100644 index 78ab30c0ce46b96b10e0a72414b1d02a037cb2fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeHN&x;&I6s`!0CT3mYB^TjkMqD+{uiiB*EeOd)f(Ri?Jc!WLOxN@j+g;suRnKM= z6};$0uX<4M;{PCuH**&8FAxGAym`&Vud2Jpo?T4#AWEbPK6?69y;rZk_v#J9yy^JO z>u=EhaKbRQ8T)PPUVL|T@zJ{un;2HS8Fkp%^>_Z&p zk>)_7=6UAWvw8Io&wyvZGvFEU40r}S1D*lTfM>un;2H1?cm^Iq26)WaoyQsb>IpcH z|Nq_^}`;0xe0;8WluUH0JsTU117*Tz&3Dii?LsUpMcMSw}5NFRbU(V z`8;Dk0IvhD0Da&w;EyMv1AGH~0w~~Z;3{wl*aE&jhrNJr&ocHZa1Qt#zW)Mz57={j z3Had|@C6DbBVH!ZQ0UHV8ls)3 z&WfF4m4x=aNGsiDn{&JrTE@uJZbG?mwRz}_)sl{_q;O{XgwdLa>Uk}*BDI<6CR46W zr1CQ3seBi0#x`|d+<0$SZU~Xc^`2JRYnQ}jULb}cjnVtu-GI-~z}#537oeT&$`g@R ztvnWmS%kFLzAt1P3zwC+c+VwBb=(ZP@{i@ne6Iew1df+=Dh4lKX}zfL#qoTZiodyz zZGu7_9*35X17V13S31b##Fx2@iBpklGr-+c!W`u#dcz6hSlAlc2!5W7*t75Pa<-60 z%uH2^VOKxFbZ#oWNGp Date: Fri, 8 Sep 2023 15:10:50 +0000 Subject: [PATCH 11/58] added stochastic depth to config --- expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index bca1428b5..62ca5a614 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -238,6 +238,7 @@ architecture: activation: gelu last_activation: none dropout: 0.1 + droppath_rate_ffn: 0.3 normalization: "layer_norm" last_normalization: *normalization residual_type: simple From dbc574fcea2a41d5611fd3b62ab04626042df9e5 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 8 Sep 2023 15:32:09 +0000 Subject: [PATCH 12/58] fixed config --- expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index 62ca5a614..3141747d4 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -77,7 +77,7 @@ datamodule: split_names: ["train", "valid", "test-dev"] seed: ${constants.seed} label_normalization: - normalize_val_test: true + normalize_val_test: false method: "normal" # Featurization @@ -238,7 +238,6 @@ 
architecture: activation: gelu last_activation: none dropout: 0.1 - droppath_rate_ffn: 0.3 normalization: "layer_norm" last_normalization: *normalization residual_type: simple @@ -256,6 +255,7 @@ architecture: attn_type: "none" # "full-attention", "none" # biased_attention: false attn_kwargs: null + droppath_rate_ffn: 0.3 graph_output_nn: graph: From ef5db7f94ab25c88ffdc5ac3749156312d386535 Mon Sep 17 00:00:00 2001 From: DomInvivo Date: Sat, 9 Sep 2023 21:20:28 -0400 Subject: [PATCH 13/58] Track learning rate with callback. `n_epochs` redundant with the `epochs` tracked by lightning. --- graphium/config/_loader.py | 7 ++++++- graphium/trainer/predictor.py | 5 ----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index da55a9266..5a6754c54 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -13,7 +13,7 @@ # Lightning from lightning import Trainer -from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint +from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor from lightning.pytorch.loggers import Logger, WandbLogger from loguru import logger @@ -415,6 +415,11 @@ def load_trainer( if "model_checkpoint" in cfg_trainer.keys(): callbacks.append(ModelCheckpoint(**cfg_trainer["model_checkpoint"])) + if "learning_rate_monitor" in cfg_trainer.keys(): + callbacks.append(LearningRateMonitor(**cfg_trainer["learning_rate_monitor"])) + else: + callbacks.append(LearningRateMonitor()) + # Define the logger parameters wandb_cfg = config["constants"].get("wandb") if wandb_cfg is not None: diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index c4e700895..6824a40df 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -618,11 +618,6 @@ def on_validation_epoch_end(self) -> None: concatenated_metrics_logs = self.task_epoch_summary.concatenate_metrics_logs(metrics_logs) concatenated_metrics_logs["val/mean_time"] = torch.tensor(self.mean_val_time_tracker.mean_value) concatenated_metrics_logs["val/mean_tput"] = self.mean_val_tput_tracker.mean_value - - if hasattr(self.optimizers(), "param_groups"): - lr = self.optimizers().param_groups[0]["lr"] - concatenated_metrics_logs["lr"] = torch.tensor(lr) - concatenated_metrics_logs["n_epochs"] = torch.tensor(self.current_epoch, dtype=torch.float32) self.log_dict(concatenated_metrics_logs) # Save yaml file with the per-task metrics summaries From 5cfd705d22808a7c082499da744616bf2dbe3632 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Mon, 11 Sep 2023 16:09:34 +0000 Subject: [PATCH 14/58] added PCQM4Mv2 GCN and GINE configs --- .../config_ogbpcq_GCN_16layers.yaml | 263 +++++++++++++++++ .../config_ogbpcq_GCN_4layers.yaml | 263 +++++++++++++++++ .../config_ogbpcq_GINE_16layers.yaml | 273 ++++++++++++++++++ .../config_ogbpcq_GINE_4layers.yaml | 273 ++++++++++++++++++ .../config_ogbpcq_mpnn_GPS++.yaml | 2 +- 5 files changed, 1073 insertions(+), 1 deletion(-) create mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml new file mode 100644 index 000000000..bffa2ee04 --- /dev/null +++ 
b/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml @@ -0,0 +1,263 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gcn + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gcn + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
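+    # Each key under `task_specific_args` below names one training task; this config
+    # defines a single task, `homolumo`, and the same task name is reused further down
+    # in `predictor.loss_fun`, `metrics` and `architecture.task_heads`.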
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
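+
+    # Rough effective batch size for this pipelined 16-layer run, assuming the usual
+    # poptorch semantics (micro-batch x deviceIterations x replicationFactor x
+    # accumulate_grad_batches): 30 x 60 x 4 x 8 = 57,600 graphs per weight update.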
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: null + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + in_dim: 64 # or otherwise the correct value + out_dim: &gnn_dim 768 + hidden_dims: *gnn_dim + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml new file mode 100644 index 000000000..ef46cda2a --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml @@ -0,0 +1,263 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gcn + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gcn + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + # _accelerator: "ipu" + # gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
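+
+    # Unlike the 16-layer variant, this 4-layer model is not pipelined across IPUs
+    # (accelerator_kwargs is commented out above); scaling comes from replicationFactor(16)
+    # instead. With accumulate_grad_batches: 2, the effective batch works out to roughly
+    # 30 x 60 x 16 x 2 = 57,600 graphs per update, matching the 16-layer run (assuming
+    # the usual poptorch batching semantics).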
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: null + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + in_dim: 64 # or otherwise the correct value + out_dim: &gnn_dim 768 + hidden_dims: *gnn_dim + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml new file mode 100644 index 000000000..d53dd60b8 --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml @@ -0,0 +1,273 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gine + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gine + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
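+
+    # The GINE layer consumes edge features, so this config keeps the bond features from
+    # edge_property_list above and, unlike the GCN configs, defines a pre_nn_edges MLP in
+    # the architecture section below to embed them before the GNN stack.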
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: # Set as null to avoid a pre-nn network + out_dim: 32 + hidden_dims: 128 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: *normalization + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + + gnn: # Set as null to avoid a post-nn network + out_dim: &gnn_dim 704 + hidden_dims: *gnn_dim + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml new file mode 100644 index 000000000..ff173adde --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml @@ -0,0 +1,273 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gine + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gine + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + # _accelerator: "ipu" + # gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
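+
+    # With label_normalization.normalize_val_test: true (set above), validation and test
+    # targets are standardised with the training-set statistics as well, so the homolumo
+    # MAE reported here is in normalized units rather than the raw label units
+    # (assumption based on the option names; see graphium's label_normalization handling).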
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: # Set as null to avoid a pre-nn network + out_dim: 32 + hidden_dims: 128 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: *normalization + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + + gnn: # Set as null to avoid a post-nn network + out_dim: &gnn_dim 704 + hidden_dims: *gnn_dim + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index 3141747d4..b9fcfc780 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -77,7 +77,7 @@ datamodule: split_names: ["train", "valid", "test-dev"] seed: ${constants.seed} label_normalization: - normalize_val_test: false + normalize_val_test: true method: "normal" # Featurization From 1cc9c0c981ba59cf256936547e8e29accf68e0cc Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Thu, 14 Sep 2023 10:43:33 +0000 Subject: [PATCH 15/58] PCQM4Mv2 configs with NEW DATA --- .../config_ogbpcq_GINE_16layers.yaml | 2 +- .../config_ogbpcq_mpnn_GPS++.yaml | 8 +- .../config_ogbpcq_mpnn_GPS++_newDATA.yaml | 295 ++++++++++++++++++ ...fig_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml | 295 ++++++++++++++++++ 4 files changed, 595 insertions(+), 5 deletions(-) create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml index d53dd60b8..bde17fe0d 100644 --- a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml @@ -234,7 +234,7 @@ architecture: gnn: # Set as null to avoid a post-nn network out_dim: &gnn_dim 704 hidden_dims: *gnn_dim - depth: 4 + depth: 16 activation: gelu last_activation: none dropout: 0.1 diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index b9fcfc780..61f645c44 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -7,10 +7,10 @@ constants: raise_train_error: true # Whether the code should raise an error if it crashes during training datacache_path: "/localdata/PCQM4Mv2/" epochs: 100 - name: ogb_pcqm4mv2_mpnn + name: ogb_pcqm4mv2_mpnn_no1hot wandb: entity: multitask-gnn - name: neurips2023_scaling_mpnn + name: neurips2023_scaling_mpnn_no1hot project: neurips2023_graphcore_scaling_mpnn @@ -91,8 +91,8 @@ datamodule: # OGB: ['atomic_num', 'degree', 
'possible_formal_charge', 'possible_numH' (total-valence), # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [atomic-number, group, period, total-valence, degree, formal-charge, radical-electron, aromatic, in-ring] # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] edge_property_list: [bond-type-onehot, stereo, in-ring] add_self_loop: False diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml new file mode 100644 index 000000000..b97402680 --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml @@ -0,0 +1,295 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
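+    # This "newDATA" variant points at a local pcqm4mv2_conformers_28features copy of the
+    # dataset (df_path / splits_path under /nethome/... below) instead of the public
+    # PCQM4M csv used by the other configs, and the column names change accordingly
+    # (smiles / homolumogap rather than cxsmiles / homo_lumo_gap).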
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "smiles" + label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
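The label_normalization block above (method: "normal" with normalize_val_test: true) standardizes the homolumogap regression target before training. A minimal sketch of the usual z-score scheme, assuming the statistics are fit on the training labels and, because normalize_val_test is enabled, also applied to the validation/test labels; the exact Graphium implementation may differ in details such as NaN handling:

    import numpy as np

    def fit_label_stats(train_labels: np.ndarray):
        # Fit mean/std on the training split only, ignoring missing labels.
        return np.nanmean(train_labels, axis=0), np.nanstd(train_labels, axis=0)

    def normalize_labels(labels: np.ndarray, mean: np.ndarray, std: np.ndarray):
        # z-score ("normal") normalization: (y - mean) / std
        return (labels - mean) / std

    # Hypothetical homolumogap targets, for illustration only.
    train_y = np.array([[5.2], [4.8], [6.1]])
    valid_y = np.array([[5.5], [4.9]])
    mean, std = fit_label_stats(train_y)
    train_y_norm = normalize_labels(train_y, mean, std)
    valid_y_norm = normalize_labels(valid_y, mean, std)  # applied since normalize_val_test: true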
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml new file mode 100644 index 000000000..94535b7cc --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml @@ -0,0 +1,295 @@ +# Running the mpnn model with the largemix dataset on IPU. 
+ +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData_4layers + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData_4layers + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(1) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + # _accelerator: "ipu" + # gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "smiles" + label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
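For a rough sense of scale, the accelerator options earlier in this config (batch_size_training: 30, accumulate_grad_batches: 2, replicationFactor(16), deviceIterations(60)) multiply into the number of graphs consumed per optimizer step and per host call. A back-of-the-envelope sketch, assuming the usual PopTorch convention that the global batch is micro-batch x gradient accumulation x replication, while device iterations only increase how many such batches run per host step; the exact accounting inside Graphium/PopTorch may differ:

    micro_batch = 30        # batch_size_training
    grad_accum = 2          # trainer.accumulate_grad_batches
    replication = 16        # replicationFactor(16)
    device_iters = 60       # deviceIterations(60)

    graphs_per_weight_update = micro_batch * grad_accum * replication  # 30 * 2 * 16 = 960
    graphs_per_host_step = graphs_per_weight_update * device_iters     # 960 * 60 = 57_600
    print(graphs_per_weight_update, graphs_per_host_step)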
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none From 6a632d50036a02783fb007473b98b84b111835b1 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 15 Sep 2023 11:10:07 +0000 Subject: [PATCH 16/58] added configs for LargeMix MPNN foundation model --- expts/foundation_model/__init__.py | 0 .../config_LargeMix_mpnn_GPS++.yaml | 462 ++++++++++++++++++ .../config_ogbpcq_mpnn_GPS++_newDATA.yaml | 295 +++++++++++ 3 files changed, 757 insertions(+) create mode 100644 expts/foundation_model/__init__.py create mode 100644 expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml diff --git a/expts/foundation_model/__init__.py b/expts/foundation_model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..dcacb5371 --- /dev/null +++ b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. 
+ +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml new file mode 100644 index 000000000..b97402680 --- 
/dev/null +++ b/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml @@ -0,0 +1,295 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "smiles" + label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
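The rw_pos entry above (pos_type: rw_return_probs, ksteps: 16) attaches a 16-dimensional random-walk positional encoding to every node: the probability that a random walk started at a node returns to it after 1 to 16 steps, i.e. the diagonals of successive powers of the transition matrix. A minimal dense sketch, assuming a simple row-normalized adjacency; Graphium's featurizer may use a sparse or otherwise normalized variant:

    import numpy as np

    def rw_return_probs(adj: np.ndarray, ksteps: int = 16) -> np.ndarray:
        # adj: (N, N) dense adjacency matrix of a single molecular graph.
        deg = adj.sum(axis=1, keepdims=True)
        trans = adj / np.clip(deg, 1, None)      # row-normalized transition matrix P
        probs, power = [], np.eye(adj.shape[0])
        for _ in range(ksteps):
            power = power @ trans                # P^k
            probs.append(np.diag(power).copy())  # return probability after k steps
        return np.stack(probs, axis=1)           # (N, ksteps) node-level features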
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none From de949f96f243dc9daba779784f2a1ff01305d6e3 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 15 Sep 2023 13:57:41 +0000 Subject: [PATCH 17/58] small config --- expts/foundation_model/small.yaml | 346 ++++++++++++++++++++++++++++++ 1 file changed, 346 insertions(+) create mode 100644 expts/foundation_model/small.yaml diff --git a/expts/foundation_model/small.yaml b/expts/foundation_model/small.yaml new file mode 100644 index 000000000..739ecb24e --- /dev/null +++ b/expts/foundation_model/small.yaml @@ -0,0 +1,346 @@ +# @package _global_ + +constants: + seed: &seed 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + entity: multitask-gnn + name: small_test + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 44 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 80 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 44 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 80 + # Data handling-related + batch_size_training: 50 + batch_size_inference: 50 + predictor: + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16 + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(5) # IPU would 
require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 128) + - Precision.enableStochasticRounding(True) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] +# accelerator: +# type: cpu # cpu or ipu or gpu +# config_override: +# datamodule: +# batch_size_training: 64 +# batch_size_inference: 256 +# trainer: +# trainer: +# precision: 32 +# accumulate_grad_batches: 1 + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + qm9: + df: null + df_path: data/neurips2023/small-dataset/qm9.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"] + # sample_size: 2000 # use sample_size for test + splits_path: data/neurips2023/small-dataset/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt` + seed: *seed + task_level: graph + label_normalization: + normalize_val_test: True + method: "normal" + + tox21: + df: null + df_path: data/neurips2023/small-dataset/Tox21-7k-12-labels.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"] + # sample_size: 2000 # use sample_size for test + splits_path: data/neurips2023/small-dataset/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt` + seed: *seed + task_level: graph + + zinc: + df: null + df_path: data/neurips2023/small-dataset/ZINC12k.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["SA", "logp", "score"] + # sample_size: 2000 # use sample_size for test + splits_path: data/neurips2023/small-dataset/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt` + seed: *seed + task_level: graph + label_normalization: + normalize_val_test: True + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 
'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 30 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: null # Set as null to avoid a pre-nn network + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + + + gnn: # Set as null to avoid a post-nn network + layer_type: 'pyg:gcn' + in_dim: 64 # or otherwise the correct value + out_dim: &gnn_dim 96 + hidden_dims: *gnn_dim + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_kwargs: null # Parameters for the model itself. 
You could define dropout_attn: 0.1 + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + qm9: + task_level: graph + out_dim: 19 + hidden_dims: 128 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + tox21: + task_level: graph + out_dim: 12 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: sigmoid + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + zinc: + task_level: graph + out_dim: 3 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + +#Task-specific +predictor: + metrics_on_progress_bar: + qm9: ["mae"] + tox21: ["auroc"] + zinc: ["mae"] + loss_fun: + qm9: mae_ipu + tox21: bce_ipu + zinc: mae_ipu + random_seed: *seed + optim_kwargs: + lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + qm9: &qm9_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: flatten + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2_score + metric: r2_score_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + tox21: + - name: auroc + metric: auroc_ipu + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: average_precision_ipu + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: f1 > 0.5 + metric: f1 + multitask_handling: mean-per-label + target_to_int: True + num_classes: 2 + average: micro + threshold_kwargs: &threshold_05 + operator: greater + threshold: 0.5 + th_on_preds: True + th_on_target: True + - name: precision > 0.5 + metric: precision + multitask_handling: mean-per-label + average: micro + threshold_kwargs: *threshold_05 + zinc: *qm9_metrics + +trainer: + seed: *seed + logger: + save_dir: logs/neurips2023-small/ + name: ${constants.name} + project: ${constants.name} + #early_stopping: + # monitor: *monitor + # min_delta: 0 + # patience: 10 + # mode: &mode min + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: *max_epochs + min_epochs: 1 + check_val_every_n_epoch: 20 From 4a83e1288739aae8a94c5392e6fe6df6286c0a7c Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 15 Sep 2023 16:01:58 +0000 Subject: [PATCH 18/58] cleaned up configs foundation model --- .../config_LargeMix_mpnn_GPS++.yaml | 16 +- expts/foundation_model/small.yaml | 346 ------------------ 2 files changed, 8 insertions(+), 354 deletions(-) delete mode 100644 expts/foundation_model/small.yaml diff --git 
a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml index dcacb5371..ce6914154 100644 --- a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml @@ -5,12 +5,12 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" + datacache_path: "/localdata/neurips2023-large/" epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData + name: LargeMix_mpnn_40M wandb: entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData + name: LargeMix_mpnn_40M project: neurips2023_graphcore_scaling_mpnn @@ -28,8 +28,8 @@ accelerator: max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 max_num_edges_per_graph: 100 # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 + batch_size_training: 16 + batch_size_inference: 16 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: @@ -37,7 +37,7 @@ accelerator: trainer: trainer: precision: 16-true - accumulate_grad_batches: 8 + accumulate_grad_batches: 16 ipu_config: - deviceIterations(30) # IPU would require large batches to be ready for the model. @@ -58,7 +58,7 @@ accelerator: accelerator_kwargs: _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] + gnn_layers_per_ipu: [4,4,4,4] datamodule: module_type: "MultitaskFromSmilesDataModule" @@ -170,7 +170,7 @@ datamodule: ksteps: 16 num_workers: 32 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. + persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. #Task-specific diff --git a/expts/foundation_model/small.yaml b/expts/foundation_model/small.yaml deleted file mode 100644 index 739ecb24e..000000000 --- a/expts/foundation_model/small.yaml +++ /dev/null @@ -1,346 +0,0 @@ -# @package _global_ - -constants: - seed: &seed 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - entity: multitask-gnn - name: small_test - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 44 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 80 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 44 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 80 - # Data handling-related - batch_size_training: 50 - batch_size_inference: 50 - predictor: - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16 - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 128) - - Precision.enableStochasticRounding(True) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] -# accelerator: -# type: cpu # cpu or ipu or gpu -# config_override: -# datamodule: -# batch_size_training: 64 -# batch_size_inference: 256 -# trainer: -# trainer: -# precision: 32 -# accumulate_grad_batches: 1 - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - qm9: - df: null - df_path: data/neurips2023/small-dataset/qm9.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"] - # sample_size: 2000 # use sample_size for test - splits_path: data/neurips2023/small-dataset/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt` - seed: *seed - task_level: graph - label_normalization: - normalize_val_test: True - method: "normal" - - tox21: - df: null - df_path: data/neurips2023/small-dataset/Tox21-7k-12-labels.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"] - # sample_size: 2000 # use sample_size for test - splits_path: data/neurips2023/small-dataset/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt` - seed: *seed - task_level: graph - - zinc: - df: null - df_path: data/neurips2023/small-dataset/ZINC12k.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["SA", "logp", "score"] - # sample_size: 2000 # use sample_size for test - splits_path: data/neurips2023/small-dataset/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt` - seed: *seed - task_level: graph - label_normalization: - normalize_val_test: True - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - 
edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 30 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null # Set as null to avoid a pre-nn network - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - layer_type: 'pyg:gcn' - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 96 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_kwargs: null # Parameters for the model itself. 
You could define dropout_attn: 0.1 - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - qm9: - task_level: graph - out_dim: 19 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - tox21: - task_level: graph - out_dim: 12 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: sigmoid - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - zinc: - task_level: graph - out_dim: 3 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - -#Task-specific -predictor: - metrics_on_progress_bar: - qm9: ["mae"] - tox21: ["auroc"] - zinc: ["mae"] - loss_fun: - qm9: mae_ipu - tox21: bce_ipu - zinc: mae_ipu - random_seed: *seed - optim_kwargs: - lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - qm9: &qm9_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: flatten - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2_score - metric: r2_score_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - tox21: - - name: auroc - metric: auroc_ipu - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: average_precision_ipu - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: f1 > 0.5 - metric: f1 - multitask_handling: mean-per-label - target_to_int: True - num_classes: 2 - average: micro - threshold_kwargs: &threshold_05 - operator: greater - threshold: 0.5 - th_on_preds: True - th_on_target: True - - name: precision > 0.5 - metric: precision - multitask_handling: mean-per-label - average: micro - threshold_kwargs: *threshold_05 - zinc: *qm9_metrics - -trainer: - seed: *seed - logger: - save_dir: logs/neurips2023-small/ - name: ${constants.name} - project: ${constants.name} - #early_stopping: - # monitor: *monitor - # min_delta: 0 - # patience: 10 - # mode: &mode min - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: *max_epochs - min_epochs: 1 - check_val_every_n_epoch: 20 From 98d3d5be016bc6d86dbe8d971f6e685f209b0e2d Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Mon, 18 Sep 2023 09:54:43 +0000 Subject: [PATCH 19/58] added 50/100/200/400M configs for foundation model --- .../100M_config_LargeMix_mpnn_GPS++.yaml | 462 +++++++++++++++++ .../200M_config_LargeMix_mpnn_GPS++.yaml | 462 +++++++++++++++++ .../400M_config_LargeMix_mpnn_GPS++.yaml | 463 ++++++++++++++++++ 
.../50M_config_LargeMix_mpnn_GPS++.yaml | 462 +++++++++++++++++ 4 files changed, 1849 insertions(+) create mode 100644 expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml diff --git a/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..c8eefd4d2 --- /dev/null +++ b/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_100M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_100M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 50 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 55 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 12 + batch_size_inference: 12 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 12 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
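As a rough aside on the accelerator block above: the numbers it sets for the 100M run (micro-batch size 12, deviceIterations(30), replicationFactor(4), accumulate_grad_batches 12, and gnn_layers_per_ipu: [4, 4, 4, 4]) compose multiplicatively. The sketch below is an illustrative, self-contained Python snippet; the helper names are hypothetical and not part of Graphium or PopTorch. It only shows the resulting samples consumed per optimizer step and how a 16-layer GNN would be split into contiguous pipeline stages.

    # Illustrative sketch only; these helpers are hypothetical, not Graphium/PopTorch APIs.

    def effective_batch_size(micro_batch, device_iterations, replication, grad_accum):
        # Samples consumed per optimizer step = product of all four multipliers.
        return micro_batch * device_iterations * replication * grad_accum

    def partition_layers(num_layers, layers_per_ipu):
        # Split layer indices 0..num_layers-1 into contiguous chunks, one chunk per IPU.
        assert sum(layers_per_ipu) == num_layers, "pipeline split must cover all layers"
        chunks, start = [], 0
        for n in layers_per_ipu:
            chunks.append(list(range(start, start + n)))
            start += n
        return chunks

    if __name__ == "__main__":
        # Numbers taken from the 100M config above.
        print(effective_batch_size(12, 30, 4, 12))   # 17280 graphs per weight update
        print(partition_layers(16, [4, 4, 4, 4]))    # four contiguous blocks of 4 layers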
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 428 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 
000000000..298d68109 --- /dev/null +++ b/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_200M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_200M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 116 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 116 + # Data handling-related + batch_size_training: 8 + batch_size_inference: 8 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(2) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
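The label_cols entries used throughout these configs (geneID-*, assayID-*, graph_*, node_*) are glob-style patterns expanded against the dataset's column names. The following is a hedged sketch of that expansion; expand_label_cols is a hypothetical helper written here for illustration, not the actual MultitaskFromSmilesDataModule logic, and it assumes pandas-style columns.

    # Hedged sketch: resolve a glob-style label_cols spec against a dataframe's columns.
    from fnmatch import fnmatch
    import pandas as pd

    def expand_label_cols(df: pd.DataFrame, spec) -> list:
        # A plain list is kept as-is; a string is treated as a glob pattern.
        if isinstance(spec, (list, tuple)):
            return list(spec)
        return [c for c in df.columns if fnmatch(c, spec)]

    df = pd.DataFrame(columns=["SMILES", "geneID-1", "geneID-2", "assayID-7"])
    print(expand_label_cols(df, "geneID-*"))      # ['geneID-1', 'geneID-2']
    print(expand_label_cols(df, ["assayID-7"]))   # ['assayID-7']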
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 628 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 
000000000..08820d330 --- /dev/null +++ b/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,463 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_400M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_400M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 70 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 150 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 70 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 150 + # Data handling-related + batch_size_training: 4 + batch_size_inference: 4 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 32 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
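A note on how the 50M/100M/200M/400M variants differ: the GNN depth stays at 16 while the hidden width (the &gnn_dim anchor) is raised from 282 to 428, 628 and 910. Since most weights sit in width-by-width linear layers, each step of roughly sqrt(2) in width gives roughly 2x the parameters. The check below is only a back-of-envelope approximation and ignores edge dimensions, PE encoders and task heads.

    # Back-of-envelope sketch, not an exact parameter count.
    widths = {"50M": 282, "100M": 428, "200M": 628, "400M": 910}

    pairs = list(widths.items())
    for (a, da), (b, db) in zip(pairs[:-1], pairs[1:]):
        ratio = (db / da) ** 2  # quadratic growth of d x d weight matrices
        print(f"{a} -> {b}: width x{db / da:.2f}, ~{ratio:.1f}x parameters")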
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 910 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..b75d7d2e0 
--- /dev/null +++ b/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_50M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_50M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 40 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 45 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 16 + batch_size_inference: 16 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
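The label_normalization blocks with method: "normal" and normalize_val_test: True standardise the regression labels. A minimal sketch of the intended behaviour, assuming per-label z-scoring with training-split statistics reused on validation/test (the details may differ from Graphium's implementation), is shown below.

    # Hedged sketch of "normal" label normalization; not the exact Graphium code.
    import numpy as np

    def fit_normal_stats(train_labels: np.ndarray):
        # Per-label mean/std computed on the training split only (NaN-safe).
        return np.nanmean(train_labels, axis=0), np.nanstd(train_labels, axis=0)

    def apply_normal(labels: np.ndarray, mean, std):
        return (labels - mean) / np.where(std > 0, std, 1.0)

    train = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 50.0]])
    val = np.array([[2.0, 20.0]])
    mean, std = fit_normal_stats(train)
    print(apply_normal(val, mean, std))  # validation labels scaled with *train* statistics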
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 282 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none From 6d7f18b26f0031d3f78e86c187c05890397126ec Mon Sep 17 00:00:00 2001 From: dominique Date: Mon, 18 Sep 2023 21:11:34 -0400 Subject: [PATCH 20/58] Added toymix baselines of 
GatedGCN and MPNN++ --- docs/baseline.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/baseline.md b/docs/baseline.md index 029996554..8c86ca067 100644 --- a/docs/baseline.md +++ b/docs/baseline.md @@ -11,18 +11,24 @@ One can observe that the smaller datasets (`Zinc12k` and `Tox21`) beneficiate fr | **QM9** | GCN | 0.102 ± 0.0003 | 0.958 ± 0.0007 | 0.920 ± 0.002 | 0.119 ± 0.01 | 0.955 ± 0.001 | 0.915 ± 0.001 | | | GIN | 0.0976 ± 0.0006 | **0.959 ± 0.0002** | **0.922 ± 0.0004** | 0.117 ± 0.01 | 0.950 ± 0.002 | 0.908 ± 0.003 | | | GINE | **0.0959 ± 0.0002** | 0.955 ± 0.002 | 0.918 ± 0.004 | 0.102 ± 0.01 | 0.956 ± 0.0009 | 0.918 ± 0.002 | -| -| **Zinc12k** | GCN | 0.348 ± 0.02 | 0.941 ± 0.002 | 0.863 ± 0.01 | 0.226 ± 0.004 | 0.973 ± 0.0005 | 0.940 ± 0.003 | +| | GatedGCN | | | | 0.1212 ± 0.0009 | 0.9457 ± 0.0002 | 0.8964 ± 0.0006 | +| | MPNN++ (sum) | | | | 0.1174 ± 0.0012 | 0.9460 ± 0.0005 | 0.8989 ± 0.0008 | + **Zinc12k** | GCN | 0.348 ± 0.02 | 0.941 ± 0.002 | 0.863 ± 0.01 | 0.226 ± 0.004 | 0.973 ± 0.0005 | 0.940 ± 0.003 | | | GIN | 0.303 ± 0.007 | 0.950 ± 0.003 | 0.889 ± 0.003 | 0.189 ± 0.004 | 0.978 ± 0.006 | 0.953 ± 0.002 | -| | GINE | 0.266 ± 0.02 | 0.961 ± 0.003 | 0.915 ± 0.01 | **0.147 ± 0.009** | **0.987 ± 0.001** | **0.971 ± 0.003** | +| | GINE | 0.266 ± 0.02 | 0.961 ± 0.003 | 0.915 ± 0.01 | 0.147 ± 0.009 | 0.987 ± 0.001 | 0.971 ± 0.003 | +| | GatedGCN | | | | 0.1282 ± 0.0045 | 0.9850 ± 0.0006 | 0.9639 ± 0.0024 | +| | MPNN++ (sum) | | | | **0.1002 ± 0.0025** | **0.9909 ± 0.0004** | **0.9777 ± 0.0014** | | | | BCE ↓ | AUROC ↑ | AP ↑ | BCE ↓ | AUROC ↑ | AP ↑ | |-----------|-------|-----------|-----------|-----------|---------|-----------|---------| | | Single-Task Model Multi-Task Model | | -| **Tox21** | GCN | 0.202 ± 0.005 | 0.773 ± 0.006 | 0.334 ± 0.03 | **0.176 ± 0.001** | **0.850 ± 0.006** | 0.446 ± 0.01 | +| **Tox21** | GCN | 0.202 ± 0.005 | 0.773 ± 0.006 | 0.334 ± 0.03 | 0.176 ± 0.001 | 0.850 ± 0.006 | 0.446 ± 0.01 | | | GIN | 0.200 ± 0.002 | 0.789 ± 0.009 | 0.350 ± 0.01 | 0.176 ± 0.001 | 0.841 ± 0.005 | 0.454 ± 0.009 | -| | GINE | 0.201 ± 0.007 | 0.783 ± 0.007 | 0.345 ± 0.02 | 0.177 ± 0.0008 | 0.836 ± 0.004 | **0.455 ± 0.008** | +| | GINE | 0.201 ± 0.007 | 0.783 ± 0.007 | 0.345 ± 0.02 | 0.177 ± 0.0008 | 0.836 ± 0.004 | 0.455 ± 0.008 | +| | GatedGCN | | | | 0.1733 ± 0.0015 | 0.8522 ± 0.0022 | **0.4620 ± 0.0118** | +| | MPNN++ (sum) | | | | **0.1725 ± 0.0012** | **0.8569 ± 0.0005** | 0.4598 ± 0.0044 | + # LargeMix Baseline ## LargeMix test set metrics From 01c40fc15aee049e5b6d4add8a1056a087cc85df Mon Sep 17 00:00:00 2001 From: dominique Date: Mon, 18 Sep 2023 21:23:40 -0400 Subject: [PATCH 21/58] Adapted text for new baselines --- docs/baseline.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/baseline.md b/docs/baseline.md index 8c86ca067..6ddb0b86c 100644 --- a/docs/baseline.md +++ b/docs/baseline.md @@ -4,6 +4,8 @@ From the paper to be released soon. Below, you can see the baselines for the `To One can observe that the smaller datasets (`Zinc12k` and `Tox21`) beneficiate from adding another unrelated task (`QM9`), where the labels are computed from DFT simulations. +**NEW baselines added 2023/09/18**: Multitask baselines have been added for GatedGCN and MPNN++ (sum aggretator) using 3 random seeds. They achieve the best performance by a significant margin on Zinc12k and Tox21, while sacrificing a little on QM9. 
+
+
 | Dataset | Model | MAE ↓ | Pearson ↑ | R² ↑ | MAE ↓ | Pearson ↑ | R² ↑ |
 |-----------|-------|-----------|-----------|-----------|---------|-----------|---------|
 | | Single-Task Model Multi-Task Model |

From e2551f26126d91f8d4a6f74f375d64b6a32f7d42 Mon Sep 17 00:00:00 2001
From: dominique
Date: Mon, 18 Sep 2023 22:03:44 -0400
Subject: [PATCH 22/58] Added new largemix baselines

---
 docs/baseline.md | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/docs/baseline.md b/docs/baseline.md
index 6ddb0b86c..cac1ee282 100644
--- a/docs/baseline.md
+++ b/docs/baseline.md
@@ -96,6 +96,40 @@ This is not surprising as they contain two orders of magnitude more datapoints a
 | | GIN | 0.1873 ± 0.0033 | **0.1701 ± 0.0142** |
 | | GINE | 0.1883 ± 0.0039 | **0.1771 ± 0.0010** |
+## NEW: Largemix improved sweep - 2023/08-18
+
+Unsatisfied with the prior results, we ran a Bayesian search over a broader set of parameters, including only the more expressive models, namely GINE, GatedGCN and MPNN++. We further increased the number of parameters to 10M due to evidence of underfitting. We evaluate only the multitask setting.
+
+We observe a significant improvement over all tasks, with a very notable R² increase of +0.53 (0.27 -> 0.80) over the best previous node-level property prediction result on PCQM4M_N4.
+
+The results reported below are over a single seed. We are currently running more seeds of the same models.
+
+| Dataset | Model | MAE ↓ | Pearson ↑ | R² ↑ |
+|---------------|----------------|--------|---------|--------|
+| **PCQM4M_G25** | GINE | 0.2250 | 0.8840 | 0.7911 |
+| | GatedGCN | 0.2457 | 0.8698 | 0.7688 |
+| | MPNN++ (sum) | 0.2269 | 0.8802 | 0.7855 |
+|
+| **PCQM4M_N4** | GINE | 0.2699 | 0.8475 | 0.7182 |
+| | GatedGCN | 0.3337 | 0.8102 | 0.6566 |
+| | MPNN++ (sum) | 0.2114 | 0.8942 | 0.8000 |
+
+| Dataset | Model | BCE ↓ | AUROC ↑ | AP ↑ |
+|---------------|----------------|--------|---------|--------|
+| **PCBA_1328** | GINE | 0.0334 | 0.7879 | 0.2808 |
+| | GatedGCN | 0.0351 | 0.7788 | 0.2611 |
+| | MPNN++ (sum) | 0.0344 | 0.7815 | 0.2666 |
+|
+| **L1000_VCAP** | GINE | 0.1907 | 0.6416 | 0.4042 |
+| | GatedGCN | 0.1866 | 0.6395 | 0.4092 |
+| | MPNN++ (sum) | 0.1867 | 0.6478 | 0.4131 |
+|
+| **L1000_MCF7** | GINE | 0.1931 | 0.6352 | 0.4235 |
+| | GatedGCN | 0.1859 | 0.6547 | 0.4224 |
+| | MPNN++ (sum) | 0.1870 | 0.6593 | 0.4254 |
+
+
+
 # UltraLarge Baseline
 ## UltraLarge test set metrics

From 1188f97651bfcb4c2a46ecaab4e288c726f8427c Mon Sep 17 00:00:00 2001
From: Maciej Sypetkowski
Date: Fri, 22 Sep 2023 08:40:52 -0600
Subject: [PATCH 23/58] Minor fixes

---
 graphium/cli/train_finetune_test.py | 3 +++
 graphium/nn/architectures/encoder_manager.py | 2 ++
 graphium/nn/architectures/global_architectures.py | 13 +++++++++++--
 graphium/nn/encoders/laplace_pos_encoder.py | 5 +++--
 graphium/nn/pyg_layers/gps_pyg.py | 6 ++++--
 graphium/nn/pyg_layers/mpnn_pyg.py | 7 ++++---
 graphium/utils/spaces.py | 5 ++++-
 7 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py
index ffb5a7512..885839ec0 100644
--- a/graphium/cli/train_finetune_test.py
+++ b/graphium/cli/train_finetune_test.py
@@ -5,6 +5,7 @@
 import fsspec
 import hydra
+import numpy as np
 import torch
 import wandb
 import yaml
@@ -40,6 +41,8 @@
 TESTING_ONLY_CONFIG_KEY = "testing_only"
+OmegaConf.register_new_resolver("eval", lambda x: eval(x, {"np": np}))
+
 @hydra.main(version_base=None, config_path="../../expts/hydra-configs", config_name="main")
def cli(cfg: DictConfig) -> None: diff --git a/graphium/nn/architectures/encoder_manager.py b/graphium/nn/architectures/encoder_manager.py index e3e48aeba..464d9e9cc 100644 --- a/graphium/nn/architectures/encoder_manager.py +++ b/graphium/nn/architectures/encoder_manager.py @@ -135,6 +135,8 @@ def _initialize_positional_encoders(self, pe_encoders_kwargs: Dict[str, Any]) -> if pe_out_dim2 is not None: assert edge_pe_out_dim == pe_out_dim2, f"values mismatch {pe_out_dim}!={pe_out_dim2}" pe_encoders[encoder_name] = encoder(out_dim=edge_pe_out_dim, **this_in_dims, **encoder_kwargs) + else: + pe_encoders[encoder_name] = encoder(**this_in_dims, **encoder_kwargs) return pe_encoders diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index a90a2cba9..75dfa7b38 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -422,6 +422,7 @@ def __init__( residual_skip_steps: int = 1, in_dim_edges: int = 0, hidden_dims_edges: List[int] = [], + out_dim_edges: int = 0, name: str = "GNN", layer_kwargs: Optional[Dict] = None, virtual_node: str = "none", @@ -509,6 +510,10 @@ def __init__( Hidden dimensions for the edges. Most models don't support it, so it should only be used for those that do, i.e. `GatedGCNLayer` + out_dim_edges: + Output edge-feature dimensions of the network. Keep at 0 if not using + edge features, or if the layer doesn't support edges. + name: Name attributed to the current network, for display and printing purposes. @@ -545,6 +550,7 @@ def __init__( # Initialize the additional attributes self.in_dim_edges = in_dim_edges + self.out_dim_edges = out_dim_edges if isinstance(hidden_dims_edges, int): self.hidden_dims_edges = [hidden_dims_edges] * (depth - 1) elif len(hidden_dims_edges) == 0: @@ -553,8 +559,9 @@ def __init__( self.hidden_dims_edges = list(hidden_dims_edges) assert depth is None self.full_dims_edges = None - if len(self.hidden_dims_edges) > 0: - self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [self.hidden_dims_edges[-1]] + if len(self.hidden_dims_edges) or out_dim_edges > 0: + assert out_dim_edges > 0, out_dim_edges + self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [out_dim_edges] self.virtual_node = virtual_node.lower() if virtual_node is not None else "none" @@ -922,6 +929,7 @@ def get_init_kwargs(self) -> Dict[str, Any]: new_kwargs = dict( in_dim_edges=self.in_dim_edges, hidden_dims_edges=self.hidden_dims_edges, + out_dim_edges=self.out_dim_edges, virtual_node=self.virtual_node, use_virtual_edges=self.use_virtual_edges, ) @@ -953,6 +961,7 @@ def make_mup_base_kwargs( kwargs["in_dim_edges"] = round(kwargs["in_dim_edges"] / divide_factor) if not self.last_layer_is_readout: kwargs["out_dim"] = round(kwargs["out_dim"] / divide_factor) + kwargs["out_dim_edges"] = round(kwargs["out_dim_edges"] / divide_factor) def _recursive_divide_dim(x: collections.abc.Mapping): for k, v in x.items(): diff --git a/graphium/nn/encoders/laplace_pos_encoder.py b/graphium/nn/encoders/laplace_pos_encoder.py index ccf642e9d..7cc69919b 100644 --- a/graphium/nn/encoders/laplace_pos_encoder.py +++ b/graphium/nn/encoders/laplace_pos_encoder.py @@ -3,7 +3,7 @@ import torch.nn as nn from torch_geometric.data import Batch -from graphium.nn.base_layers import MLP, get_norm, FCLayer +from graphium.nn.base_layers import MLP, get_norm, FCLayer, TransformerEncoderLayerMup from graphium.nn.encoders.base_encoder import BaseEncoder @@ -70,7 +70,8 @@ 
def __init__( if self.model_type == "Transformer": # Transformer model for LapPE model_kwargs.setdefault("nhead", 1) - encoder_layer = nn.TransformerEncoderLayer( + encoder_layer = TransformerEncoderLayerMup( + None, d_model=hidden_dim, batch_first=True, dropout=dropout, diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index f3da56979..b82fad782 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -240,7 +240,7 @@ def forward(self, batch: Batch) -> Batch: def _parse_mpnn_layer(self, mpnn_type, mpnn_kwargs: Dict[str, Any]) -> Optional[Module]: """Parse the MPNN layer.""" - if mpnn_type is None: + if mpnn_type is None or mpnn_type == "none": return mpnn_kwargs = deepcopy(mpnn_kwargs) @@ -375,7 +375,7 @@ def _self_attention_block(self, feat: Tensor, feat_in: Tensor, batch: Batch) -> ) attn_bias = None - if self.biased_attention_key is not None: + if self.biased_attention_key is not None and self.biased_attention_key != 'none': attn_bias = batch[self.biased_attention_key] # h_dense[num_graphs, max_num_nodes, hidden_dim] -> feat_attn[num_graphs, max_num_nodes, hidden_dim] @@ -463,6 +463,8 @@ def layer_outputs_edges(self) -> bool: bool: Always ``False`` for the current class """ + if self.mpnn is None: + return False return self.mpnn.layer_outputs_edges @property diff --git a/graphium/nn/pyg_layers/mpnn_pyg.py b/graphium/nn/pyg_layers/mpnn_pyg.py index e11e0c9f3..73118c8f7 100644 --- a/graphium/nn/pyg_layers/mpnn_pyg.py +++ b/graphium/nn/pyg_layers/mpnn_pyg.py @@ -130,14 +130,15 @@ def __init__( self.num_edge_mlp = num_edge_mlp self.edge_dropout_rate = edge_dropout_rate - self.aggregator = MultiAggregation(aggregation_method) + self.aggregator = MultiAggregation(list(aggregation_method)) + n_agg = len(aggregation_method) # node_model: edge_dim = self.out_dim_edges if use_edges else self.in_dim_edges if self.node_combine_method == "concat": - node_model_in_dim = 3 * self.in_dim + 2 * edge_dim + node_model_in_dim = (1 + 2 * n_agg) * self.in_dim + 2 * n_agg * edge_dim elif self.node_combine_method == "sum": - node_model_in_dim = 2 * self.in_dim + edge_dim + node_model_in_dim = (1 + n_agg) * self.in_dim + n_agg * edge_dim else: raise ValueError(f"node_combine_method {self.node_combine_method} not recognised.") node_model_hidden_dim = self.mlp_expansion_ratio * self.in_dim diff --git a/graphium/utils/spaces.py b/graphium/utils/spaces.py index 8ba7c4505..d821223a4 100644 --- a/graphium/utils/spaces.py +++ b/graphium/utils/spaces.py @@ -35,6 +35,7 @@ "pyg:pna-msgpass": PygLayers.PNAMessagePassingPyg, "pyg:gps": PygLayers.GPSLayerPyg, "pyg:dimenet": PygLayers.DimeNetPyg, + "pyg:mpnnplus": PygLayers.MPNNPlusPyg, } LAYERS_DICT = deepcopy(FC_LAYERS_DICT) @@ -51,6 +52,8 @@ } LOSS_DICT = { + "bce": torch.nn.BCELoss, + "bce_logits": torch.nn.BCEWithLogitsLoss, "mse": torch.nn.MSELoss, "bce": torch.nn.BCELoss, "l1": torch.nn.L1Loss, @@ -105,7 +108,7 @@ "msle": TorchMetrics.mean_squared_log_error, "pearsonr": TorchMetrics.pearson_corrcoef, "spearmanr": TorchMetrics.spearman_corrcoef, - "r2": TorchMetrics.r2_score, + "r2_score": TorchMetrics.r2_score, "cosine": TorchMetrics.cosine_similarity, "pearsonr_ipu": Metrics.pearson_ipu, "spearmanr_ipu": Metrics.spearman_ipu, From b3b4b87ddf92a5f43a966445579544ec3362dea7 Mon Sep 17 00:00:00 2001 From: shenyangHuang Date: Sat, 23 Sep 2023 11:55:36 -0400 Subject: [PATCH 24/58] update dataset links --- docs/datasets.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/datasets.md 
b/docs/datasets.md
index fc4e0f292..6733736f4 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -1,6 +1,8 @@
 # Graphium Datasets
-Graphium datasets are hosted at on Zenodo on [this link](https://zenodo.org/record/8206704).
+Graphium datasets are hosted on Zenodo:
+- ***ToyMix*** and ***LargeMix*** datasets are hosted on [this link](https://doi.org/10.5281/zenodo.7998401)
+- ***UltraLarge*** dataset is hosted on [this link](https://doi.org/10.5281/zenodo.8370547)
 Instead of provinding datasets as a single entity, our aim is to provide dataset mixes containing a variety of datasets that are meant to be predicted simultaneously using multi-tasking.

From b34cb9ee3289ae162565dea0a4ccc5c32690dcd5 Mon Sep 17 00:00:00 2001
From: kerstink-GC
Date: Mon, 25 Sep 2023 09:44:11 +0000
Subject: [PATCH 25/58] added configs with scaling factor sections

---
 .../SF_11M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_42M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 5 files changed, 2380 insertions(+)
 create mode 100644 expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml

diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
new file mode 100644
index 000000000..a1778670e
--- /dev/null
+++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
@@ -0,0 +1,476 @@
+# Running the mpnn model with the largemix dataset on IPU.
+
+# @package _global_
+
+constants:
+  seed: 42
+  raise_train_error: true   # Whether the code should raise an error if it crashes during training
+  datacache_path: "/localdata/neurips2023-large/"
+  epochs: 100
+  name: SF_11M_sweep_LargeMix_mpnn
+  wandb:
+    entity: multitask-gnn
+    name: SF_11M_sweep_LargeMix_mpnn
+    project: neurips2023_graphcore_scaling_mpnn
+
+  # This whole section is for minimizing mistakes for the scaling experiments.
+  # This is the ONLY place where dimensions have to change.
+  # No other dimensions have to be changed in the architecture part.
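
[Editor's note: the comment block above describes the intent of the `dimensions` section that follows: every width in the architecture refers back to it through OmegaConf interpolation (e.g. `out_dim: ${dimensions.pre_nn_out_dim}` further down in this file), and patch 23 registers an `eval` resolver in `graphium/cli/train_finetune_test.py`. Below is a minimal, self-contained Python sketch of both mechanisms; the `lr` expression is a hypothetical usage added for illustration only, it is not taken from these configs.]

import numpy as np
from omegaconf import OmegaConf

# Same resolver as the one registered in graphium/cli/train_finetune_test.py (patch 23).
OmegaConf.register_new_resolver("eval", lambda x: eval(x, {"np": np}))

cfg = OmegaConf.create(
    {
        # A width defined once in the `dimensions` block...
        "dimensions": {"pre_nn_out_dim": 256},
        # ...and re-used elsewhere via interpolation, as the SF_* configs do.
        "architecture": {"pre_nn": {"out_dim": "${dimensions.pre_nn_out_dim}"}},
        # Hypothetical use of the eval resolver (not present in these configs).
        "lr": "${eval:'1e-4 * np.sqrt(4)'}",
    }
)

assert cfg.architecture.pre_nn.out_dim == 256  # interpolation resolved on access
assert abs(float(cfg.lr) - 2e-4) < 1e-12       # eval resolver result
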
+ +dimensions: + pre_nn_out_dim: &gnn_dim 256 # original 256 + pre_nn_hidden_dims: 1024 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 128 # original 128 + pre_nn_edges_hidden_dims: 512 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 128 # original 128 + l1000_mcf7_hidden_dims: 128 # original 128 + pcba_1328_hidden_dims: 64 # original 64 + pcqm4m_g25_hidden_dims: 32 # original 32 + pcqm4m_n4_hidden_dims: 32 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + #_accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + #gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git 
a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..a645870be --- /dev/null +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_169M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_169M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 512 # original 256 + pre_nn_hidden_dims: 2048 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 256 # original 128 + pre_nn_edges_hidden_dims: 1024 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 256 # original 128 + l1000_mcf7_hidden_dims: 256 # original 128 + pcba_1328_hidden_dims: 128 # original 64 + pcqm4m_g25_hidden_dims: 64 # original 32 + pcqm4m_n4_hidden_dims: 64 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 116 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 116 + # Data handling-related + batch_size_training: 8 + batch_size_inference: 8 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(2) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + #gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
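
[Editor's note: the task configs above select labels with wildcards such as `geneID-*`, `assayID-*`, `graph_*` and `node_*`, which the inline comments describe as prefix matches over the dataset columns. The snippet below is a generic pandas illustration of that prefix semantics only, not graphium's actual loader, and the tiny data frame is made up.]

import pandas as pd

# Made-up stand-in for one of the label tables referenced above.
df = pd.DataFrame(
    {
        "SMILES": ["CCO", "c1ccccc1"],
        "geneID-1": [0.1, -1.2],
        "geneID-2": [1.0, 0.3],
        "split": ["train", "val"],
    }
)

# "geneID-*" -> every column whose name starts with "geneID-"
label_cols = [c for c in df.columns if c.startswith("geneID-")]
labels = df[label_cols]
print(label_cols)    # ['geneID-1', 'geneID-2']
print(labels.shape)  # (2, 2)
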
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..5a5dbc203 --- /dev/null +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_378M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_378M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 768 # original 256 + pre_nn_hidden_dims: 3072 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 384 # original 128 + pre_nn_edges_hidden_dims: 1536 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 384 # original 128 + l1000_mcf7_hidden_dims: 384 # original 128 + pcba_1328_hidden_dims: 192 # original 64 + pcqm4m_g25_hidden_dims: 96 # original 32 + pcqm4m_n4_hidden_dims: 96 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 80 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 115 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 115 + # Data handling-related + batch_size_training: 7 + batch_size_inference: 7 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 64 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(1) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + #gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
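+    # Rough sizing sketch for the accelerator block above (an assumption based on the usual
+    # PopTorch semantics, to be verified against the Graphium/PopTorch docs): one optimizer
+    # step consumes about
+    #   batch_size_training * accumulate_grad_batches * replicationFactor
+    #   = 7 * 64 * 1 = 448 graphs
+    # for this config; deviceIterations only changes how many such accumulation windows are
+    # streamed to the IPUs per host step, not the size of each weight update.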
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + #in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..a7b42ce53 --- /dev/null +++ b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_42M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_42M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 256 # original 256 + pre_nn_hidden_dims: 1024 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 128 # original 128 + pre_nn_edges_hidden_dims: 512 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 128 # original 128 + l1000_mcf7_hidden_dims: 128 # original 128 + pcba_1328_hidden_dims: 64 # original 64 + pcqm4m_g25_hidden_dims: 32 # original 32 + pcqm4m_n4_hidden_dims: 32 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 40 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 45 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 16 + batch_size_inference: 16 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
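+    # Pipeline-layout sketch for the accelerator_kwargs above (hedged; the exact rule is set
+    # by Graphium's IPU integration): gnn_layers_per_ipu: [4,4,4,4] presumably splits the
+    # 16 GNN layers (architecture.gnn.depth: 16) as 4 layers on each of 4 IPUs per replica,
+    # so with replicationFactor(4) the job spans 4 * 4 = 16 IPUs in total. The commented
+    # alternatives ([2]*8, [1]*16) trade fewer replicas for a deeper pipeline on the same
+    # 16-IPU budget.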
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..b403e958e --- /dev/null +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_173M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_173M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 1024 # original 256 + pre_nn_hidden_dims: 4096 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 + pre_nn_edges_hidden_dims: 2048 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 512 # original 128 + l1000_mcf7_hidden_dims: 512 # original 128 + pcba_1328_hidden_dims: 256 # original 64 + pcqm4m_g25_hidden_dims: 128 # original 32 + pcqm4m_n4_hidden_dims: 128 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 116 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 116 + # Data handling-related + batch_size_training: 8 + batch_size_inference: 8 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 60 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(2) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget 
https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
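+    # Launch sketch (hedged; this mirrors the graphium-train/Hydra invocation pattern used
+    # elsewhere in this patch series, and the overrides below are placeholders):
+    #   graphium-train \
+    #     --config-path=expts/foundation_model/ \
+    #     --config-name=SF_671M_config_LargeMix_mpnn_GPS++.yaml \
+    #     constants.name=SF_671M_smoke_test \
+    #     +datamodule.args.task_specific_args.pcba_1328.sample_size=2000
+    # The "+" prefix adds a key that only appears as a comment in this file (sample_size),
+    # which is one way to apply the "use sample_size for test" hint above without editing
+    # the YAML.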
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + From 011167251e66a5b45357b861c81aa13beaa028c1 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 04:10:38 -0600 Subject: [PATCH 26/58] Fix style --- graphium/nn/pyg_layers/gps_pyg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index b82fad782..555d8bef0 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -375,7 +375,7 @@ def _self_attention_block(self, feat: Tensor, feat_in: Tensor, batch: Batch) -> ) attn_bias = None - if self.biased_attention_key is not None and self.biased_attention_key != 'none': + if self.biased_attention_key is not None and self.biased_attention_key != "none": attn_bias = batch[self.biased_attention_key] # h_dense[num_graphs, max_num_nodes, hidden_dim] -> feat_attn[num_graphs, max_num_nodes, hidden_dim] From dcf237ed3f9c71d8d061cc3f239eb85da48d8d03 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 04:16:28 -0600 Subject: [PATCH 27/58] Rename r2 -> r2_score in configs --- expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml b/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml index 87136b683..1affb5042 100644 --- a/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml +++ b/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml @@ -80,7 +80,7 @@ metrics: target_nan_mask: null multitask_handling: mean-per-label - name: r2_score - metric: r2 + metric: r2_score target_nan_mask: null multitask_handling: mean-per-label threshold_kwargs: null @@ -138,4 +138,4 @@ datamodule: args: # TDC specific tdc_benchmark_names: null - tdc_train_val_seed: ${constants.seed} \ No newline at end of file + tdc_train_val_seed: ${constants.seed} From a38ba090d12842d6851a3811aa74e3146ac71738 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 04:36:50 -0600 Subject: [PATCH 28/58] Make out_dim_edges to default to the last hidden edge dim --- .../nn/architectures/global_architectures.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index 75dfa7b38..0eaee740f 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -422,7 +422,7 @@ def __init__( residual_skip_steps: int = 1, in_dim_edges: int = 0, hidden_dims_edges: List[int] = [], - out_dim_edges: int = 0, + out_dim_edges: Optional[int] = None, name: str = "GNN", layer_kwargs: Optional[Dict] = None, virtual_node: str = "none", @@ -512,7 +512,8 @@ def __init__( out_dim_edges: Output edge-feature dimensions of the network. Keep at 0 if not using - edge features, or if the layer doesn't support edges. + edge features, or if the layer doesn't support edges. Defaults to the + last value of hidden_dims_edges. 
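+                For example, with hidden_dims_edges=[128, 128] and out_dim_edges left as
+                None, the edge output dimension resolves to 128.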
name: Name attributed to the current network, for display and printing @@ -550,7 +551,6 @@ def __init__( # Initialize the additional attributes self.in_dim_edges = in_dim_edges - self.out_dim_edges = out_dim_edges if isinstance(hidden_dims_edges, int): self.hidden_dims_edges = [hidden_dims_edges] * (depth - 1) elif len(hidden_dims_edges) == 0: @@ -558,10 +558,17 @@ def __init__( else: self.hidden_dims_edges = list(hidden_dims_edges) assert depth is None + self.out_dim_edges = ( + out_dim_edges + if out_dim_edges is not None + else self.hidden_dims_edges[-1] + if self.hidden_dims_edges + else 0 + ) self.full_dims_edges = None - if len(self.hidden_dims_edges) or out_dim_edges > 0: - assert out_dim_edges > 0, out_dim_edges - self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [out_dim_edges] + if len(self.hidden_dims_edges) or self.out_dim_edges > 0: + assert self.out_dim_edges > 0, self.out_dim_edges + self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [self.out_dim_edges] self.virtual_node = virtual_node.lower() if virtual_node is not None else "none" From a0db06d80d8792b0ab0ced23ace8ff3bbf1193e0 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 05:24:28 -0600 Subject: [PATCH 29/58] Fix artifact logging, log also unresolved config --- graphium/cli/train_finetune_test.py | 3 ++- graphium/config/_loader.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index 885839ec0..55a27881c 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -57,6 +57,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: The main (pre-)training and fine-tuning loop. 
""" + unresolved_cfg = OmegaConf.to_container(cfg, resolve=False) cfg = OmegaConf.to_container(cfg, resolve=True) dst_dir = cfg["constants"].get("results_dir") @@ -136,7 +137,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: trainer.callbacks.append(GraphFinetuning(**finetuning_training_kwargs)) if wandb_cfg is not None: - save_params_to_wandb(trainer.logger, cfg, predictor, datamodule) + save_params_to_wandb(trainer.logger, cfg, predictor, datamodule, unresolved_config=unresolved_cfg) # Determine the max num nodes and edges in training and validation logger.info("Computing the maximum number of nodes and edges per graph") diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 5a6754c54..ab8b72e05 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -445,6 +445,7 @@ def save_params_to_wandb( config: Union[omegaconf.DictConfig, Dict[str, Any]], predictor: PredictorModule, datamodule: MultitaskFromSmilesDataModule, + unresolved_config: Optional[Union[omegaconf.DictConfig, Dict[str, Any]]] = None, ): """ Save a few stuff to weights-and-biases WandB @@ -453,13 +454,16 @@ def save_params_to_wandb( config: The config file, with key `trainer` predictor: The predictor used to handle the train/val/test steps logic datamodule: The datamodule used to load the data into training + unresolved_config: The unresolved config file """ # Get the wandb runner and directory wandb_run = logger.experiment + if wandb_run is None: - wandb_run = "" - wandb_dir = wandb_run.dir + wandb_dir = "" + else: + wandb_dir = wandb_run.dir # Save the mup base model to WandB as a yaml file mup.save_base_shapes(predictor.model, os.path.join(wandb_dir, "mup_base_params.yaml")) @@ -468,14 +472,18 @@ def save_params_to_wandb( with open(os.path.join(wandb_dir, "full_configs.yaml"), "w") as file: yaml.dump(config, file) + if unresolved_config is not None: + with open(os.path.join(wandb_dir, "unresolved_config.yaml"), "w") as file: + yaml.dump(unresolved_config, file) + # Save the featurizer into wandb featurizer_path = os.path.join(wandb_dir, "featurizer.pickle") joblib.dump(datamodule.smiles_transformer, featurizer_path) # Save the featurizer and configs into wandb if wandb_run is not None: - wandb_run.save("*.yaml") - wandb_run.save("*.pickle") + wandb_run.save(os.path.join(wandb_dir, "*.yaml"), wandb_dir) + wandb_run.save(os.path.join(wandb_dir, "*.pickle"), wandb_dir) def load_accelerator(config: Union[omegaconf.DictConfig, Dict[str, Any]]) -> Tuple[Dict[str, Any], str]: From 4d59a517e5447b953ea77a7457ed127b73da0efd Mon Sep 17 00:00:00 2001 From: wenkelf Date: Mon, 25 Sep 2023 15:11:14 +0000 Subject: [PATCH 30/58] Few mup updates --- .gitignore | 1 + graphium/cli/train_finetune_test.py | 4 +++- graphium/config/_loader.py | 3 +++ graphium/nn/utils.py | 2 +- scripts/scale_mpnn.sh | 9 +++++++++ 5 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 scripts/scale_mpnn.sh diff --git a/.gitignore b/.gitignore index b9f39521e..c43a954c3 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ tests/temp_cache* predictions/ draft/ scripts-expts/ +mup/ # Data and predictions graphium/data/ZINC_bench_gnn/ diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index 885839ec0..e7909a17f 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -142,9 +142,11 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: logger.info("Computing the maximum number of nodes and edges per graph") 
predictor.set_max_nodes_edges_per_graph(datamodule, stages=["train", "val"]) + ckpt_path = cfg["trainer"].pop("resume_from_checkpoint", None) + # Run the model training with SafeRun(name="TRAINING", raise_error=cfg["constants"]["raise_train_error"], verbose=True): - trainer.fit(model=predictor, datamodule=datamodule) + trainer.fit(model=predictor, datamodule=datamodule, ckpt_path=ckpt_path) # Save validation metrics - Base utility in case someone doesn't use a logger. results = trainer.callback_metrics diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 5a6754c54..f54d5db55 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -327,6 +327,9 @@ def load_predictor( model_class=model_class, model_kwargs=scaled_model_kwargs, metrics=metrics, + task_levels=task_levels, + featurization=featurization, + task_norms=task_norms, **cfg_pred, ) diff --git a/graphium/nn/utils.py b/graphium/nn/utils.py index 68a8779c4..e9ac4fa0c 100644 --- a/graphium/nn/utils.py +++ b/graphium/nn/utils.py @@ -40,7 +40,7 @@ def scale_kwargs(self, scale_factor: Real, scale_in_dim: bool = False): divide_factor = 1 / scale_factor - if scale_in_dim is None: + if not scale_in_dim: return self.make_mup_base_kwargs(divide_factor=divide_factor) # If scale_in_dim passed, need to check it can be forwarded diff --git a/scripts/scale_mpnn.sh b/scripts/scale_mpnn.sh new file mode 100644 index 000000000..8cd61fb86 --- /dev/null +++ b/scripts/scale_mpnn.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +graphium-train \ + --config-path=/home/frederik_valencediscovery_com/projects/graphium_hps/expts/configs/ \ + --config-name=config_mpnn_base.yaml \ + constants.max_epochs=100 \ + trainer.model_checkpoint.dirpath=model_checkpoints/large-dataset/scale_mpnn/ \ + +architecture.mup_scale_factor=2 +architecture.mup_base_path=mup/mpnn_base/base_shapes.yaml \ + datamodule.args.batch_size_inference=1024 datamodule.args.batch_size_training=1024 +trainer.trainer.accumulate_grad_batches=2 \ \ No newline at end of file From 1f869ccf239bce1e6d616b4fbce5ee026d804ca8 Mon Sep 17 00:00:00 2001 From: wenkelf Date: Mon, 25 Sep 2023 20:20:47 +0000 Subject: [PATCH 31/58] Updating predictor --- graphium/trainer/predictor.py | 1 + graphium/trainer/predictor_options.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index 6824a40df..19a5c7a29 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -221,6 +221,7 @@ def configure_optimizers(self, impl=None): # Define the optimizer and schedulers optimiser = MuAdam(self.parameters(), **self.optim_options.optim_kwargs, impl=impl) + self.optim_options.torch_scheduler_kwargs.pop("module_type") torch_scheduler = self.optim_options.scheduler_class( optimizer=optimiser, **self.optim_options.torch_scheduler_kwargs ) diff --git a/graphium/trainer/predictor_options.py b/graphium/trainer/predictor_options.py index 0bab97674..3fbfb4e4d 100644 --- a/graphium/trainer/predictor_options.py +++ b/graphium/trainer/predictor_options.py @@ -99,7 +99,7 @@ def set_kwargs(self): self.torch_scheduler_kwargs.setdefault("module_type", "ReduceLROnPlateau") # Get the class for the scheduler - scheduler_class = self.torch_scheduler_kwargs.pop("module_type", None) + scheduler_class = self.torch_scheduler_kwargs.get("module_type", None) if self.scheduler_class is None: if isinstance(scheduler_class, str): self.scheduler_class = SCHEDULER_DICT[scheduler_class] From 870ea4017b299995aa1a3499776f190a3ed74b4a Mon 
Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 29 Sep 2023 12:28:00 +0000 Subject: [PATCH 32/58] Adding a 671M config that compiles and should run ona POD 16 system --- ..._config_LargeMix_mpnn_GPS++_compiling.yaml | 482 ++++++++++++++++++ 1 file changed, 482 insertions(+) create mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml new file mode 100644 index 000000000..2d00c9768 --- /dev/null +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml @@ -0,0 +1,482 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/net/group/research/kerstink/neurips2023-large/" + epochs: 100 + name: SF_671M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_671M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 1024 # original 256 + pre_nn_hidden_dims: 4096 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 + pre_nn_edges_hidden_dims: 2048 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 512 # original 128 + l1000_mcf7_hidden_dims: 512 # original 128 + pcba_1328_hidden_dims: 256 # original 64 + pcqm4m_g25_hidden_dims: 128 # original 32 + pcqm4m_n4_hidden_dims: 128 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 128 + max_num_edges: 272 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 128 + max_num_edges: 272 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 2 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(1) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(1) + - 'setAvailableMemoryProportion({"IPU0": 0.1})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph 
+ splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_nodes: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + From 8a9acd31d993ef76f6781ed43aec7f86e594023a Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 29 Sep 2023 12:32:00 +0000 Subject: [PATCH 33/58] nodes -> atoms --- .../SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml index 2d00c9768..01b0bf315 100644 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml @@ -181,7 +181,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False - max_num_nodes: 100 + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: From 2c7f037bde32418b59b3384cf8149504608d94c7 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Wed, 4 Oct 2023 10:42:10 +0000 Subject: [PATCH 34/58] Config changes --- .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 31 +++---- .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 82 ++++++++++--------- 2 files changed, 60 insertions(+), 53 deletions(-) diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 5a5dbc203..490da9370 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -45,45 +45,45 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 80 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 115 + max_num_nodes_per_graph: 56 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 96 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 115 + max_num_nodes_per_graph: 56 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 96 # Data handling-related - batch_size_training: 7 - batch_size_inference: 7 + batch_size_training: 4 + batch_size_inference: 4 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1024 + loss_scaling: 16000 trainer: trainer: precision: 16-true - accumulate_grad_batches: 64 + accumulate_grad_batches: 240 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. + - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- replicationFactor(1) # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) - Precision.enableStochasticRounding(True) # - Precision.enableFloatingPointExceptions(True) ipu_inference_config: # set device iteration and replication factor to 1 during inference # gradient accumulation was set to 1 in the code - - deviceIterations(30) + - deviceIterations(20) - replicationFactor(1) - Precision.enableStochasticRounding(False) accelerator_kwargs: _accelerator: "ipu" gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] #gnn_layers_per_ipu: [4,4,4,4] datamodule: @@ -176,6 +176,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: @@ -195,8 +196,8 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. + num_workers: 30 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. #Task-specific diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml index b403e958e..c6719001a 100644 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml @@ -5,12 +5,12 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" + datacache_path: "../../neurips2023-large/" epochs: 100 - name: SF_173M_sweep_LargeMix_mpnn + name: SF_671M_sweep_LargeMix_mpnn wandb: entity: multitask-gnn - name: SF_173M_sweep_LargeMix_mpnn + name: SF_671M_sweep_LargeMix_mpnn project: neurips2023_graphcore_scaling_mpnn # This whole sections is for minimizing mistakes for the scaling experiments. @@ -18,25 +18,25 @@ constants: # No other dimensions have to be changed in the architecture part. 
dimensions: - pre_nn_out_dim: &gnn_dim 1024 # original 256 - pre_nn_hidden_dims: 4096 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 - pre_nn_edges_hidden_dims: 2048 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 512 # original 128 - l1000_mcf7_hidden_dims: 512 # original 128 - pcba_1328_hidden_dims: 256 # original 64 - pcqm4m_g25_hidden_dims: 128 # original 32 - pcqm4m_n4_hidden_dims: 128 # original 32 + pre_nn_out_dim: 960 + pre_nn_hidden_dims: 3840 + pre_nn_edges_out_dim: 480 + pre_nn_edges_hidden_dims: 1920 + gnn_out_dim: 960 + gnn_hidden_dims: 960 + mpnn_in_dim: 960 + mpnn_out_dim: 960 + mpnn_in_dim_edges: 480 + mpnn_out_dim_edges: 480 + graph_output_nn_out_dims: 960 + graph_output_nn_hidden_dims: 960 + node_output_nn_out_dims: 960 + node_output_nn_hidden_dims: 960 + l1000_vcap_hidden_dims: 480 + l1000_mcf7_hidden_dims: 480 + pcba_1328_hidden_dims: 240 + pcqm4m_g25_hidden_dims: 120 + pcqm4m_n4_hidden_dims: 120 accelerator: type: ipu # cpu or ipu or gpu @@ -45,15 +45,19 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 116 + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 116 + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 # Data handling-related - batch_size_training: 8 - batch_size_inference: 8 + batch_size_training: 3 + batch_size_inference: 3 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: @@ -61,29 +65,30 @@ accelerator: trainer: trainer: precision: 16-true - accumulate_grad_batches: 60 + accumulate_grad_batches: 320 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(2) + - deviceIterations(5) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") - Precision.enableStochasticRounding(True) # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: # set device iteration and replication factor to 1 during inference # gradient accumulation was set to 1 in the code - - deviceIterations(30) + - deviceIterations(16) - replicationFactor(1) - Precision.enableStochasticRounding(False) accelerator_kwargs: _accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] datamodule: @@ -176,6 +181,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: @@ -196,7 +202,7 @@ datamodule: ksteps: 16 num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. + persistent_workers: True # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. #Task-specific From 826996a5704359f53dc3ae8ffe19ca684e175d7f Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Wed, 4 Oct 2023 12:51:16 +0000 Subject: [PATCH 35/58] 378M debug config --- ...378M_config_LargeMix_mpnn_GPS++_debug.yaml | 481 +++++++++++++++++ .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 482 ++++++++++++++++++ 2 files changed, 963 insertions(+) create mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml create mode 100644 expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml new file mode 100644 index 000000000..d7fe43e6e --- /dev/null +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml @@ -0,0 +1,481 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_378M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_378M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. 
+ +dimensions: + pre_nn_out_dim: 768 + pre_nn_hidden_dims: 3072 + pre_nn_edges_out_dim: 384 + pre_nn_edges_hidden_dims: 1536 + gnn_out_dim: 768 + gnn_hidden_dims: 768 + mpnn_in_dim: 768 + mpnn_out_dim: 768 + mpnn_in_dim_edges: 384 + mpnn_out_dim_edges: 384 + graph_output_nn_out_dims: 768 + graph_output_nn_hidden_dims: 768 + node_output_nn_out_dims: 768 + node_output_nn_hidden_dims: 768 + l1000_vcap_hidden_dims: 384 + l1000_mcf7_hidden_dims: 384 + pcba_1328_hidden_dims: 192 + pcqm4m_g25_hidden_dims: 96 + pcqm4m_n4_hidden_dims: 96 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 3 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(5) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(16) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_atoms: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + 
multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git 
a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..c6719001a --- /dev/null +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,482 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "../../neurips2023-large/" + epochs: 100 + name: SF_671M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_671M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: 960 + pre_nn_hidden_dims: 3840 + pre_nn_edges_out_dim: 480 + pre_nn_edges_hidden_dims: 1920 + gnn_out_dim: 960 + gnn_hidden_dims: 960 + mpnn_in_dim: 960 + mpnn_out_dim: 960 + mpnn_in_dim_edges: 480 + mpnn_out_dim_edges: 480 + graph_output_nn_out_dims: 960 + graph_output_nn_hidden_dims: 960 + node_output_nn_out_dims: 960 + node_output_nn_hidden_dims: 960 + l1000_vcap_hidden_dims: 480 + l1000_mcf7_hidden_dims: 480 + pcba_1328_hidden_dims: 240 + pcqm4m_g25_hidden_dims: 120 + pcqm4m_n4_hidden_dims: 120 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 3 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(5) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(16) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_atoms: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + 
multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + From 
428847f2a96e1f38b7a0de6edf160d25694170ca Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Thu, 5 Oct 2023 15:01:21 +0000 Subject: [PATCH 36/58] Simple solution to save checkpoints to wandb --- graphium/cli/train_finetune_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index ffb5a7512..298da8df6 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -161,6 +161,12 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: logger.info("-" * 50) if wandb_cfg is not None: + # Save final model state - and upload checkpoint to wandb + if cfg["trainer"]["model_checkpoint"]["save_last"] is True: + checkpoint_path = f"{cfg['trainer']['model_checkpoint']['dirpath']}{cfg['trainer']['model_checkpoint']['filename']}_final_model.ckpt" + torch.save(predictor.model.state_dict(), checkpoint_path) + # Log the final model checkpoint to wandb + wandb.save(checkpoint_path) wandb.finish() # Save test metrics - Base utility in case someone doesn't use a logger. From a2670e51455737961bf6b0b937cf893f8b682084 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 6 Oct 2023 10:24:02 +0000 Subject: [PATCH 37/58] more explicit checking edges --- graphium/features/featurizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py index 66f241663..0d917b9a6 100644 --- a/graphium/features/featurizer.py +++ b/graphium/features/featurizer.py @@ -1062,11 +1062,15 @@ def mol_to_graph_dict( mol = Chem.AddHs(mol) else: mol = Chem.RemoveHs(mol) - + # SAMUELM: Temp fix + max_num_bonds = 265 num_atoms = mol.GetNumAtoms() + num_bonds = mol.GetNumBonds() if (max_num_atoms is not None) and (num_atoms > max_num_atoms): raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}") - + elif (max_num_atoms is not None) and (num_bonds > max_num_bonds): + raise ValueError(f"Maximum number of bonds greater than permitted {num_bonds}>{max_num_bonds}") + else: ( adj, ndata, From 473dd6f7b31469db55303db05eb837fffc73a0cc Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 6 Oct 2023 10:26:24 +0000 Subject: [PATCH 38/58] Dumb print --- .../foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml | 6 +++--- graphium/config/_loader.py | 2 +- graphium/features/featurizer.py | 2 ++ graphium/finetuning/utils.py | 1 - 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index a1778670e..7b151aa5e 100644 --- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -5,7 +5,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" + datacache_path: "/net/group/research/kerstink/neurips2023-large/" epochs: 100 name: SF_11M_sweep_LargeMix_mpnn wandb: @@ -231,8 +231,8 @@ predictor: # weight_decay: 1.e-7 torch_scheduler_kwargs: module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 + max_num_epochs: &max_epochs 10 + warmup_epochs: 5 verbose: False scheduler_kwargs: # monitor: &monitor qm9/mae/train diff --git a/graphium/config/_loader.py 
+++ b/graphium/config/_loader.py
@@ -421,7 +421,7 @@ def load_trainer(
         name = wandb_cfg.pop("name", "main")
         if len(date_time_suffix) > 0:
             name += f"_{date_time_suffix}"
-        trainer_kwargs["logger"] = WandbLogger(name=name, **wandb_cfg)
+        trainer_kwargs["logger"] = WandbLogger(name=name, log_model=True, **wandb_cfg)
 
     trainer_kwargs["callbacks"] = callbacks
     trainer = Trainer(
diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py
index 0d917b9a6..56e8bd036 100644
--- a/graphium/features/featurizer.py
+++ b/graphium/features/featurizer.py
@@ -1067,8 +1067,10 @@ def mol_to_graph_dict(
     num_atoms = mol.GetNumAtoms()
     num_bonds = mol.GetNumBonds()
     if (max_num_atoms is not None) and (num_atoms > max_num_atoms):
+        logger.info("removing based on atoms")
         raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
     elif (max_num_atoms is not None) and (num_bonds > max_num_bonds):
+        logger.info("removing based on edges")
         raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
     else:
     (
diff --git a/graphium/finetuning/utils.py b/graphium/finetuning/utils.py
index ede0f639c..4bb343180 100644
--- a/graphium/finetuning/utils.py
+++ b/graphium/finetuning/utils.py
@@ -45,7 +45,6 @@ def modify_cfg_for_finetuning(cfg: Dict[str, Any]):
     """
     Function combining information from configuration and pretrained model for finetuning.
     """
-    task = cfg["finetuning"]["task"]
 
     # Filter the config based on the task name

From 5700c439b85afcdc78b83f5e0dc159142d273399 Mon Sep 17 00:00:00 2001
From: Sam Maddrell-Mander
Date: Fri, 6 Oct 2023 15:49:07 +0000
Subject: [PATCH 39/58] Sweep Configs

---
 .../SF_11M_config_LargeMix_mpnn_GPS++.yaml | 6 +-
 .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 4 +-
 .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 76 +--
 ...378M_config_LargeMix_mpnn_GPS++_debug.yaml | 481 ------------------
 .../SF_42M_config_LargeMix_mpnn_GPS++.yaml | 4 +-
 .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 12 +-
 graphium/cli/train_finetune_test.py | 10 +-
 graphium/features/featurizer.py | 5 -
 8 files changed, 61 insertions(+), 537 deletions(-)
 delete mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml

diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
index 7b151aa5e..140bc6ad3 100644
--- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
+++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
@@ -231,8 +231,8 @@ predictor:
     # weight_decay: 1.e-7
   torch_scheduler_kwargs:
     module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 10
-    warmup_epochs: 5
+    max_num_epochs: &max_epochs 100
+    warmup_epochs: 10
     verbose: False
   scheduler_kwargs:
     # monitor: &monitor qm9/mae/train
@@ -302,7 +302,7 @@ trainer:
     name: ${constants.name}
     project: ${constants.name}
   model_checkpoint:
-    dirpath: models_checkpoints/${constants.name}/
+    dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/
     filename: ${constants.name}
     # monitor: *monitor
     # mode: *mode
diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
index a645870be..12bdf9806 100644
--- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
+++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
@@ -195,7 +195,7 @@ datamodule:
         pos_type: rw_return_probs
         ksteps: 16
 
-    num_workers: 32 # -1 to use all
+    num_workers: 16 # -1 to use all
persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. @@ -302,7 +302,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 490da9370..85668d003 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -18,25 +18,25 @@ constants: # No other dimensions have to be changed in the architecture part. dimensions: - pre_nn_out_dim: &gnn_dim 768 # original 256 - pre_nn_hidden_dims: 3072 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 384 # original 128 - pre_nn_edges_hidden_dims: 1536 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 384 # original 128 - l1000_mcf7_hidden_dims: 384 # original 128 - pcba_1328_hidden_dims: 192 # original 64 - pcqm4m_g25_hidden_dims: 96 # original 32 - pcqm4m_n4_hidden_dims: 96 # original 32 + pre_nn_out_dim: 768 + pre_nn_hidden_dims: 3072 + pre_nn_edges_out_dim: 384 + pre_nn_edges_hidden_dims: 1536 + gnn_out_dim: 768 + gnn_hidden_dims: 768 + mpnn_in_dim: 768 + mpnn_out_dim: 768 + mpnn_in_dim_edges: 384 + mpnn_out_dim_edges: 384 + graph_output_nn_out_dims: 768 + graph_output_nn_hidden_dims: 768 + node_output_nn_out_dims: 768 + node_output_nn_hidden_dims: 768 + l1000_vcap_hidden_dims: 384 + l1000_mcf7_hidden_dims: 384 + pcba_1328_hidden_dims: 192 + pcqm4m_g25_hidden_dims: 96 + pcqm4m_n4_hidden_dims: 96 accelerator: type: ipu # cpu or ipu or gpu @@ -45,38 +45,43 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 56 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 96 + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 56 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 96 + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 # Data handling-related - batch_size_training: 4 - batch_size_inference: 4 + batch_size_training: 3 + batch_size_inference: 3 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 16000 + loss_scaling: 1024 trainer: trainer: precision: 16-true - accumulate_grad_batches: 240 + accumulate_grad_batches: 320 ipu_config: - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") # - TensorLocations.numIOTiles(128) # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") - Precision.enableStochasticRounding(True) # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: # set device iteration and replication factor to 1 during inference # gradient accumulation was set to 1 in the code - - deviceIterations(20) + - deviceIterations(16) - replicationFactor(1) - Precision.enableStochasticRounding(False) @@ -84,8 +89,7 @@ accelerator: _accelerator: "ipu" gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - #gnn_layers_per_ipu: [4,4,4,4] - + datamodule: module_type: "MultitaskFromSmilesDataModule" # module_type: "FakeDataModule" # Option to use generated data @@ -196,9 +200,9 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 30 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. + num_workers: 16 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. #Task-specific predictor: @@ -303,7 +307,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode @@ -367,7 +371,7 @@ architecture: first_normalization: "layer_norm" #"batch_norm" or "layer_norm" gnn: # Set as null to avoid a post-nn network - #in_dim: 256 # should be consistent with pre_nn.out_dim + # in_dim: 256 # should be consistent with pre_nn.out_dim out_dim: ${dimensions.gnn_out_dim} hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) depth: 16 diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml deleted file mode 100644 index d7fe43e6e..000000000 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml +++ /dev/null @@ -1,481 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_378M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_378M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. 
- -dimensions: - pre_nn_out_dim: 768 - pre_nn_hidden_dims: 3072 - pre_nn_edges_out_dim: 384 - pre_nn_edges_hidden_dims: 1536 - gnn_out_dim: 768 - gnn_hidden_dims: 768 - mpnn_in_dim: 768 - mpnn_out_dim: 768 - mpnn_in_dim_edges: 384 - mpnn_out_dim_edges: 384 - graph_output_nn_out_dims: 768 - graph_output_nn_hidden_dims: 768 - node_output_nn_out_dims: 768 - node_output_nn_hidden_dims: 768 - l1000_vcap_hidden_dims: 384 - l1000_mcf7_hidden_dims: 384 - pcba_1328_hidden_dims: 192 - pcqm4m_g25_hidden_dims: 96 - pcqm4m_n4_hidden_dims: 96 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 3 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. - - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git 
a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml index a7b42ce53..04c393733 100644 --- a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml @@ -195,7 +195,7 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all + num_workers: 16 # -1 to use all persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. @@ -302,7 +302,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml index c6719001a..bf884762b 100644 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -7,10 +7,10 @@ constants: raise_train_error: true # Whether the code should raise an error if it crashes during training datacache_path: "../../neurips2023-large/" epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn + name: SF_590M_sweep_LargeMix_mpnn wandb: entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn + name: SF_590M_sweep_LargeMix_mpnn project: neurips2023_graphcore_scaling_mpnn # This whole sections is for minimizing mistakes for the scaling experiments. @@ -48,13 +48,13 @@ accelerator: # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 # max_num_edges_per_graph: 116 max_num_nodes: 140 - max_num_edges: 290 + max_num_edges: 298 ipu_dataloader_inference_opts: mode: async # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 # max_num_edges_per_graph: 116 max_num_nodes: 140 - max_num_edges: 290 + max_num_edges: 298 # Data handling-related batch_size_training: 3 batch_size_inference: 3 @@ -201,7 +201,7 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all + num_workers: 16 # -1 to use all persistent_workers: True # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. 
@@ -308,7 +308,7 @@ trainer:
     name: ${constants.name}
     project: ${constants.name}
   model_checkpoint:
-    dirpath: models_checkpoints/${constants.name}/
+    dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/
     filename: ${constants.name}
     # monitor: *monitor
     # mode: *mode
diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py
index 298da8df6..e91fd7681 100644
--- a/graphium/cli/train_finetune_test.py
+++ b/graphium/cli/train_finetune_test.py
@@ -56,6 +56,13 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None:
 
     cfg = OmegaConf.to_container(cfg, resolve=True)
 
+    # Get the current date and time
+    now = datetime.now()
+    # Format the datetime as a string
+    filename_datetime_suffix = now.strftime("%Y%m%d_%H%M%S")
+    # Append the datetime string to the existing filename in the cfg dictionary
+    cfg['trainer']['model_checkpoint']['filename'] += f"_{filename_datetime_suffix}"
+
     dst_dir = cfg["constants"].get("results_dir")
     hydra_cfg = HydraConfig.get()
     output_dir = hydra_cfg["runtime"]["output_dir"]
@@ -163,8 +170,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None:
     if wandb_cfg is not None:
         # Save the final model state - and upload checkpoint to wandb
         if cfg["trainer"]["model_checkpoint"]["save_last"] is True:
-            checkpoint_path = f"{cfg['trainer']['model_checkpoint']['dirpath']}{cfg['trainer']['model_checkpoint']['filename']}_final_model.ckpt"
-            torch.save(predictor.model.state_dict(), checkpoint_path)
+            checkpoint_path = f"{cfg['trainer']['model_checkpoint']['dirpath']}{cfg['trainer']['model_checkpoint']['filename']}.ckpt"
             # Log the final model checkpoint to wandb
             wandb.save(checkpoint_path)
         wandb.finish()
diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py
index 56e8bd036..cb19e3b4b 100644
--- a/graphium/features/featurizer.py
+++ b/graphium/features/featurizer.py
@@ -1067,12 +1067,7 @@ def mol_to_graph_dict(
     num_atoms = mol.GetNumAtoms()
     num_bonds = mol.GetNumBonds()
     if (max_num_atoms is not None) and (num_atoms > max_num_atoms):
-        logger.info("removing based on atoms")
         raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
-    elif (max_num_atoms is not None) and (num_bonds > max_num_bonds):
-        logger.info("removing based on edges")
-        raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
-    else:
     (
         adj,
         ndata,

From cee7adb4cd16d7e9956f190dd211fcc4e66bcf5a Mon Sep 17 00:00:00 2001
From: Sam Maddrell-Mander
Date: Mon, 9 Oct 2023 15:35:52 +0000
Subject: [PATCH 40/58] Adding the edge residual, making node residual more explicit, and adding activation scaling

---
 graphium/config/_loader.py | 2 +-
 graphium/nn/pyg_layers/gps_pyg.py | 58 +++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py
index 0c9b42aa5..9861f7436 100644
--- a/graphium/config/_loader.py
+++ b/graphium/config/_loader.py
@@ -76,7 +76,6 @@ def _get_ipu_opts(config: Union[omegaconf.DictConfig, Dict[str, Any]]) -> Tuple[
 
     if accelerator_type != "ipu":
         return None, None
-
     ipu_opts = accelerator_options["ipu_config"]
     ipu_inference_opts = accelerator_options.get("ipu_inference_config", None)
 
@@ -126,6 +125,7 @@ def load_datamodule(
             ipu_inference_opts=ipu_inference_opts,
             precision=config["trainer"]["trainer"].get("precision"),
         )
 
+        # Define the Dataloader options for the IPU on the training sets
         bz_train = cfg_data["batch_size_training"]
         ipu_dataloader_training_opts = IPUDataloaderOptions(
diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py
index 555d8bef0..5b00c3d07 100644
--- a/graphium/nn/pyg_layers/gps_pyg.py
+++ b/graphium/nn/pyg_layers/gps_pyg.py
@@ -47,9 +47,10 @@ def __init__(
         activation: Union[Callable, str] = "relu",
         dropout: float = 0.0,
         node_residual: Optional[bool] = True,
+        edge_residual: Optional[bool] = True,
         normalization: Union[str, Callable] = "none",
         mpnn_type: str = "pyg:gine",
-        mpnn_kwargs=None,
+        mpnn_kwargs: Optional[dict] = None,
         attn_type: str = "full-attention",
         precision: str = "32",
         biased_attention_key: Optional[str] = None,
@@ -57,6 +58,7 @@ def __init__(
         droppath_rate_attn: float = 0.0,
         droppath_rate_ffn: float = 0.0,
         hidden_dim_scaling: float = 4.0,
+        output_scale: float = 1.0,
         **kwargs,
     ):
         r"""
@@ -99,6 +101,9 @@ def __init__(
             node_residual:
                 If node residual is used after on the gnn layer output
 
+            edge_residual:
+                If edge residual is used after on the gnn layer output
+
             normalization:
                 Normalization to use. Choices:
 
@@ -140,6 +145,10 @@ def __init__(
             attn_kwargs:
                 Keyword arguments to pass to the attention layer
+
+            output_scale:
+                Float value that will be used to scale the activations, helps reudce growth of activations
+                as the model gets deeper. Default value of 1.0 leaves the layer unchanged.
 
         """
 
@@ -165,6 +174,7 @@ def __init__(
 
         # Residual connections
        self.node_residual = node_residual
+        self.edge_residual = edge_residual
 
        self.precision = precision
 
@@ -189,6 +199,38 @@ def __init__(
         # Initialize the MPNN and Attention layers
         self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs)
         self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs)
+
+        self.output_scale = torch.tensor(output_scale)
+        self.use_edges = self.mpnn.use_edges
+
+
+    def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor:
+        r"""
+        Residual additition layer. Allows information to propagate through the model
+        by skipping the computational layers.
+        Parameters:
+            feature: The feature (typically nodes or edges) after message passing
+            input_feature: The same feature from before message passing
+        Returns:
+            The addition of the two tensors.
+        """
+        feature += input_feature
+        return feature
+
+    def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor:
+        """Scale Activations by a constant factor to stop growth of activation scale
+        and reduce numerical stability issues at low precision
+
+        Args:
+            feature (Tensor): The feature to scale
+            scale_factor (float): The floating point scale factor
+
+        Returns:
+            Tensor: The scaled features
+        """
+        feature *= scale_factor.to(dtype=feature.dtype)
+        return feature
+
+
     def forward(self, batch: Batch) -> Batch:
         r"""
@@ -200,6 +242,8 @@ def forward(self, batch: Batch) -> Batch:
         """
         # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat
         feat = batch.feat
+        # TODO: samuelm - check if edges are being used here
+        edges_feat_in = batch.edge_feat
 
         feat_in = feat  # for first residual connection
 
@@ -208,10 +252,20 @@ def forward(self, batch: Batch) -> Batch:
         if self.mpnn is not None:
             batch_out = self.mpnn(batch_out)
             h_local = batch_out.feat
+            e_local = batch_out.edge_feat
             if self.dropout_local is not None:
                 h_local = self.dropout_local(h_local)
+            # Apply the residual connection for the node features
             if self.node_residual:
-                h_local = feat_in + h_local  # Residual connection for nodes, not used in gps++.
+ h_local = self.residual_add(h_local, feat_in) + # Scale the activations by some value to help reduce activation growth + h_local = self.scale_activations(h_local, self.output_scale) + # Apply the residual connection for the edge features + if self.edge_residual: + e_local = self.residual_add(e_local, edges_feat_in) + # Scale the activations by some value to help reduce activation growth + e_local = self.scale_activations(e_local, self.output_scale) + if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) From 19df1c3e9950c3164e643c5aafb12deecf06d4c3 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 15:42:16 +0000 Subject: [PATCH 41/58] linting --- graphium/nn/pyg_layers/gps_pyg.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 5b00c3d07..f232d2954 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -145,10 +145,10 @@ def __init__( attn_kwargs: Keyword arguments to pass to the attention layer - + output_scale: Float value that will be used to scale the activations, helps reudce growth of activations - as the model gets deeper. Default value of 1.0 leaves the layer unchanged. + as the model gets deeper. Default value of 1.0 leaves the layer unchanged. """ @@ -199,15 +199,14 @@ def __init__( # Initialize the MPNN and Attention layers self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs) self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) - + self.output_scale = torch.tensor(output_scale) self.use_edges = self.mpnn.use_edges - def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: r""" Residual additition layer. Allows information to propagate through the model - by skipping the computational layers. + by skipping the computational layers. 
Parameters: feature: The feature (typically nodes or edges) after message passing input_feature: The same feature from before message passing @@ -216,21 +215,20 @@ def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: """ feature += input_feature return feature - + def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor: """Scale Activations by a constant factor to stop growth of activation scale and reduce numerical stability issues at low precision Args: feature (Tensor): The feature to scale - scale_factor (float): The floating point scale factor + scale_factor (float): The floating point scale factor Returns: Tensor: The scaled features """ feature *= scale_factor.to(dtype=feature.dtype) return feature - def forward(self, batch: Batch) -> Batch: r""" @@ -255,17 +253,17 @@ def forward(self, batch: Batch) -> Batch: e_local = batch_out.edge_feat if self.dropout_local is not None: h_local = self.dropout_local(h_local) - # Apply the residual connection for the node features + # Apply the residual connection for the node features if self.node_residual: h_local = self.residual_add(h_local, feat_in) # Scale the activations by some value to help reduce activation growth h_local = self.scale_activations(h_local, self.output_scale) - # Apply the residual connection for the edge features + # Apply the residual connection for the edge features if self.edge_residual: e_local = self.residual_add(e_local, edges_feat_in) # Scale the activations by some value to help reduce activation growth - e_local = self.scale_activations(e_local, self.output_scale) - + e_local = self.scale_activations(e_local, self.output_scale) + if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) From f6274b016a66af3e0920c5b12979786a94166697 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 16:20:39 +0000 Subject: [PATCH 42/58] Fixing use_edges --- graphium/nn/pyg_layers/gps_pyg.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index f232d2954..d96c4ee1c 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -201,7 +201,7 @@ def __init__( self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) self.output_scale = torch.tensor(output_scale) - self.use_edges = self.mpnn.use_edges + self.use_edges = True if self.in_dim_edges is not None else False def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: r""" @@ -241,7 +241,8 @@ def forward(self, batch: Batch) -> Batch: # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat feat = batch.feat # TODO: samuelm - check if edges are being used here - edges_feat_in = batch.edge_feat + if self.use_edges: + edges_feat_in = batch.edge_feat feat_in = feat # for first residual connection @@ -259,10 +260,11 @@ def forward(self, batch: Batch) -> Batch: # Scale the activations by some value to help reduce activation growth h_local = self.scale_activations(h_local, self.output_scale) # Apply the residual connection for the edge features - if self.edge_residual: + if self.edge_residual and self.use_edges: e_local = self.residual_add(e_local, edges_feat_in) # Scale the activations by some value to help reduce activation growth - e_local = self.scale_activations(e_local, self.output_scale) + if self.use_edges: + e_local = self.scale_activations(e_local, 
self.output_scale) if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) From 3ac163ce756949c6b45ee7a63979e08afd867a0c Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 16:22:55 +0000 Subject: [PATCH 43/58] remove todo --- graphium/nn/pyg_layers/gps_pyg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index d96c4ee1c..530f42d32 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -240,7 +240,6 @@ def forward(self, batch: Batch) -> Batch: """ # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat feat = batch.feat - # TODO: samuelm - check if edges are being used here if self.use_edges: edges_feat_in = batch.edge_feat From 400fa98d54b3e61d8e684013eb82db73ba0d9c9e Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Tue, 10 Oct 2023 01:52:34 +0900 Subject: [PATCH 44/58] Fix typo in README.md permamently -> permanently --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11b707bba..a83f7ab40 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ To change parameters specific to this experiment like switching from `fp16` to ` ```bash graphium-train dataset=toymix model=gcn trainer.trainer.precision=32 ``` -or change them permamently in the dedicated experiment config under `expts/hydra-configs/toymix_gcn.yaml`. +or change them permanently in the dedicated experiment config under `expts/hydra-configs/toymix_gcn.yaml`. Integrating `hydra` also allows you to quickly switch between accelerators. E.g., running ```bash graphium-train dataset=toymix model=gcn accelerator=gpu From eef2f0f81b1a0f446c57a10468b42789502c174d Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:15:39 +0000 Subject: [PATCH 45/58] Device requirement for IPU training --- graphium/nn/pyg_layers/gps_pyg.py | 64 +++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index f3da56979..2a649373b 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -47,9 +47,10 @@ def __init__( activation: Union[Callable, str] = "relu", dropout: float = 0.0, node_residual: Optional[bool] = True, + edge_residual: Optional[bool] = True, normalization: Union[str, Callable] = "none", mpnn_type: str = "pyg:gine", - mpnn_kwargs=None, + mpnn_kwargs: Optional[dict] = None, attn_type: str = "full-attention", precision: str = "32", biased_attention_key: Optional[str] = None, @@ -57,6 +58,7 @@ def __init__( droppath_rate_attn: float = 0.0, droppath_rate_ffn: float = 0.0, hidden_dim_scaling: float = 4.0, + output_scale: float = 1.0, **kwargs, ): r""" @@ -99,6 +101,9 @@ def __init__( node_residual: If node residual is used after on the gnn layer output + edge_residual: + If edge residual is used after on the gnn layer output + normalization: Normalization to use. Choices: @@ -141,6 +146,10 @@ def __init__( attn_kwargs: Keyword arguments to pass to the attention layer + output_scale: + Float value that will be used to scale the activations, helps reudce growth of activations + as the model gets deeper. Default value of 1.0 leaves the layer unchanged. 
+ """ super().__init__( @@ -165,6 +174,7 @@ def __init__( # Residual connections self.node_residual = node_residual + self.edge_residual = edge_residual self.precision = precision @@ -190,6 +200,37 @@ def __init__( self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs) self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) + self.output_scale = output_scale + self.use_edges = True if self.in_dim_edges is not None else False + + def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: + r""" + Residual additition layer. Allows information to propagate through the model + by skipping the computational layers. + Parameters: + feature: The feature (typically nodes or edges) after message passing + input_feature: The same feature from before message passing + Returns: + The addition of the two tensors. + """ + feature += input_feature + return feature + + def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor: + """Scale Activations by a constant factor to stop growth of activation scale + and reduce numerical stability issues at low precision + + Args: + feature (Tensor): The feature to scale + scale_factor (float): The floating point scale factor + + Returns: + Tensor: The scaled features + """ + scale_factor = torch.tensor(scale_factor).to(feature.device) + feature *= scale_factor.to(dtype=feature.dtype) + return feature + def forward(self, batch: Batch) -> Batch: r""" forward function of the layer @@ -200,6 +241,8 @@ def forward(self, batch: Batch) -> Batch: """ # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat feat = batch.feat + if self.use_edges: + edges_feat_in = batch.edge_feat feat_in = feat # for first residual connection @@ -208,10 +251,21 @@ def forward(self, batch: Batch) -> Batch: if self.mpnn is not None: batch_out = self.mpnn(batch_out) h_local = batch_out.feat + e_local = batch_out.edge_feat if self.dropout_local is not None: h_local = self.dropout_local(h_local) + # Apply the residual connection for the node features if self.node_residual: - h_local = feat_in + h_local # Residual connection for nodes, not used in gps++. 
+ h_local = self.residual_add(h_local, feat_in) + # Scale the activations by some value to help reduce activation growth + h_local = self.scale_activations(h_local, self.output_scale) + # Apply the residual connection for the edge features + if self.edge_residual and self.use_edges: + e_local = self.residual_add(e_local, edges_feat_in) + # Scale the activations by some value to help reduce activation growth + if self.use_edges: + e_local = self.scale_activations(e_local, self.output_scale) + if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) @@ -240,7 +294,7 @@ def forward(self, batch: Batch) -> Batch: def _parse_mpnn_layer(self, mpnn_type, mpnn_kwargs: Dict[str, Any]) -> Optional[Module]: """Parse the MPNN layer.""" - if mpnn_type is None: + if mpnn_type is None or mpnn_type == "none": return mpnn_kwargs = deepcopy(mpnn_kwargs) @@ -375,7 +429,7 @@ def _self_attention_block(self, feat: Tensor, feat_in: Tensor, batch: Batch) -> ) attn_bias = None - if self.biased_attention_key is not None: + if self.biased_attention_key is not None and self.biased_attention_key != "none": attn_bias = batch[self.biased_attention_key] # h_dense[num_graphs, max_num_nodes, hidden_dim] -> feat_attn[num_graphs, max_num_nodes, hidden_dim] @@ -463,6 +517,8 @@ def layer_outputs_edges(self) -> bool: bool: Always ``False`` for the current class """ + if self.mpnn is None: + return False return self.mpnn.layer_outputs_edges @property From 7154c0afc494b4e6d87995a1d8e8cdf6fb40ce76 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:17:56 +0000 Subject: [PATCH 46/58] Config changes --- expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index 140bc6ad3..251bdae15 100644 --- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -380,7 +380,9 @@ architecture: use_virtual_edges: true layer_type: 'pyg:gps' layer_kwargs: - node_residual: false + node_residual: True + edge_residual: True + output_scale: 1.0 mpnn_type: 'pyg:mpnnplus' mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) From 1133ea50f0d9f9a465b6d5dfd7c73ac6cbbc8e36 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:18:47 +0000 Subject: [PATCH 47/58] Device changes --- graphium/nn/pyg_layers/gps_pyg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 530f42d32..2a649373b 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -200,7 +200,7 @@ def __init__( self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs) self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) - self.output_scale = torch.tensor(output_scale) + self.output_scale = output_scale self.use_edges = True if self.in_dim_edges is not None else False def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: @@ -227,6 +227,7 @@ def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor: Returns: Tensor: The scaled features """ + scale_factor = torch.tensor(scale_factor).to(feature.device) feature *= scale_factor.to(dtype=feature.dtype) return feature From 
a66c546ae4e2ec633b3bbc40259bb76af820fd5c Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:19:50 +0000 Subject: [PATCH 48/58] typo - review resolved --- graphium/nn/pyg_layers/gps_pyg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 2a649373b..35adb27d8 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -147,7 +147,7 @@ def __init__( Keyword arguments to pass to the attention layer output_scale: - Float value that will be used to scale the activations, helps reudce growth of activations + Float value that will be used to scale the activations, helps reduce growth of activations as the model gets deeper. Default value of 1.0 leaves the layer unchanged. """ From 16100a7990dc4cb156a282956a792995b124321e Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 10 Oct 2023 09:51:38 +0000 Subject: [PATCH 49/58] Node + edge residual example --- .../foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml index 12bdf9806..02b99e0bf 100644 --- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -380,8 +380,10 @@ architecture: use_virtual_edges: true layer_type: 'pyg:gps' layer_kwargs: - node_residual: false mpnn_type: 'pyg:mpnnplus' + node_residual: True + edge_residual: True + output_scale: 1.0 mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) From b949e700b66318c56008ea6c567854ee5d497303 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Tue, 10 Oct 2023 10:20:53 +0000 Subject: [PATCH 50/58] cleaned up configs --- .../100M_config_LargeMix_mpnn_GPS++.yaml | 462 ----------------- .../200M_config_LargeMix_mpnn_GPS++.yaml | 462 ----------------- .../400M_config_LargeMix_mpnn_GPS++.yaml | 463 ----------------- .../50M_config_LargeMix_mpnn_GPS++.yaml | 462 ----------------- .../SF_11M_config_LargeMix_mpnn_GPS++.yaml | 6 +- .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 4 +- .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 2 +- .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 2 +- .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 482 ------------------ ..._config_LargeMix_mpnn_GPS++_compiling.yaml | 482 ------------------ 10 files changed, 7 insertions(+), 2820 deletions(-) delete mode 100644 expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml diff --git a/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index c8eefd4d2..000000000 --- a/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the 
largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_100M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_100M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 50 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 55 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 12 - batch_size_inference: 12 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 12 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 428 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 
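
For reference, the removed 100M config above stacks several multipliers that together set how many graphs one IPU host step consumes: batch_size_training 12, deviceIterations(30), replicationFactor(4) and accumulate_grad_batches 12. A minimal sketch of that arithmetic, assuming the usual PopTorch convention that these factors simply multiply the micro-batch; the helper name is illustrative and the exact accounting inside Graphium's IPU plugin may differ:

    # Illustrative helper, not a Graphium API: graphs consumed per host step
    # under the deleted 100M LargeMix config.
    def graphs_per_host_step(micro_batch, device_iterations, replication, grad_accum):
        # Each weight update sees micro_batch * replication * grad_accum graphs;
        # device_iterations such updates run per call into the IPU.
        return micro_batch * device_iterations * replication * grad_accum

    print(graphs_per_host_step(12, 30, 4, 12))  # 17280 graphs per host step
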
298d68109..000000000 --- a/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_200M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_200M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 116 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 116 - # Data handling-related - batch_size_training: 8 - batch_size_inference: 8 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(2) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 628 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 
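
The main difference between these deleted scaling configs is how the 16 GPS++ layers are pipelined: gnn_layers_per_ipu is [4,4,4,4] at 100M, [2,2,2,2,2,2,2,2] at 200M and sixteen ones at 400M. A small sketch of how such a list could map layers to pipeline stages, assuming gnn_layers_per_ipu simply lists consecutive layer counts per device (illustrative only, not Graphium's actual pipelining code):

    # Illustrative only: turn a gnn_layers_per_ipu list into layer -> IPU assignments.
    def layer_to_ipu(gnn_layers_per_ipu):
        mapping, layer = {}, 0
        for ipu_id, n_layers in enumerate(gnn_layers_per_ipu):
            for _ in range(n_layers):
                mapping[layer] = ipu_id
                layer += 1
        return mapping

    assert layer_to_ipu([4, 4, 4, 4]) == {i: i // 4 for i in range(16)}  # 100M split
    assert layer_to_ipu([2] * 8) == {i: i // 2 for i in range(16)}       # 200M split
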
08820d330..000000000 --- a/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,463 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_400M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_400M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 70 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 150 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 70 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 150 - # Data handling-related - batch_size_training: 4 - batch_size_inference: 4 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 32 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(1) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 910 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 
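
Every task in these configs selects its label columns with a prefix wildcard (label_cols: geneID-*, assayID-*, graph_*, node_*). A minimal sketch of how such a pattern can be expanded against a dataframe header, assuming the wildcard is an ordinary glob-style prefix match; the real resolution happens inside Graphium's datamodule and may be more general:

    import fnmatch

    # Illustrative only: expand a "geneID-*"-style pattern against column names.
    def expand_label_cols(pattern, columns):
        return [col for col in columns if fnmatch.fnmatch(col, pattern)]

    cols = ["SMILES", "geneID-1", "geneID-2", "assayID-7", "graph_homo"]
    print(expand_label_cols("geneID-*", cols))  # ['geneID-1', 'geneID-2']
    print(expand_label_cols("graph_*", cols))   # ['graph_homo']
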
b75d7d2e0..000000000 --- a/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_50M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_50M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 40 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 45 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 16 - batch_size_inference: 16 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 282 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index 140bc6ad3..3580c5c35 100644 --- 
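
Across the 50M/100M/200M/400M files removed above, the single scaling knob is the YAML anchor on the pre-NN width (out_dim: &gnn_dim 282, 428, 628 and 910 respectively); every other width references it through *gnn_dim, so one number sets the whole model size. A short sketch of that mechanism on a toy snippet, shown with PyYAML (which resolves anchors and aliases at load time) rather than the real config:

    import yaml  # PyYAML resolves &anchor / *alias references when loading

    toy = "pre_nn:\n  out_dim: &gnn_dim 282\ngnn:\n  out_dim: *gnn_dim\n  hidden_dims: *gnn_dim\n"
    cfg = yaml.safe_load(toy)
    assert cfg["gnn"]["out_dim"] == cfg["gnn"]["hidden_dims"] == cfg["pre_nn"]["out_dim"] == 282
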
a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -5,7 +5,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/net/group/research/kerstink/neurips2023-large/" + datacache_path: "/localdata/neurips2023-large/" epochs: 100 name: SF_11M_sweep_LargeMix_mpnn wandb: @@ -195,7 +195,7 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all + num_workers: 16 # -1 to use all persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. @@ -302,7 +302,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ + dirpath: ./models_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml index 12bdf9806..9d7aad096 100644 --- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -57,7 +57,7 @@ accelerator: predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1024 + loss_scaling: 8192 trainer: trainer: precision: 16-true @@ -79,7 +79,7 @@ accelerator: - deviceIterations(30) - replicationFactor(1) - Precision.enableStochasticRounding(False) - + accelerator_kwargs: _accelerator: "ipu" #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 85668d003..6dba7e4ad 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -61,7 +61,7 @@ accelerator: predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1024 + loss_scaling: 1 trainer: trainer: precision: 16-true diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml index bf884762b..5e537c039 100644 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -5,7 +5,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "../../neurips2023-large/" + datacache_path: "/localdata/neurips2023-large/" epochs: 100 name: SF_590M_sweep_LargeMix_mpnn wandb: diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index c6719001a..000000000 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,482 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
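
The hunks above retune loss_scaling per model size (8192 for the 169M run, 1 for the 378M run, 1024 elsewhere) while training at precision: 16-true. A generic sketch of static loss scaling, assuming the usual recipe of scaling the loss before backward and unscaling the gradients before the optimizer step; how Graphium's predictor and PopTorch apply the option internally may differ:

    import torch

    # Illustrative static loss scaling, not the Graphium/PopTorch implementation:
    # scale the loss so small fp16 gradients do not flush to zero, then divide
    # the accumulated gradients back before the optimizer step.
    def scaled_backward(loss, parameters, loss_scaling=1024.0):
        (loss * loss_scaling).backward()
        for p in parameters:
            if p.grad is not None:
                p.grad.div_(loss_scaling)

    model = torch.nn.Linear(4, 1)
    loss = model(torch.randn(8, 4)).pow(2).mean()
    scaled_backward(loss, model.parameters(), loss_scaling=8192.0)
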
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "../../neurips2023-large/" - epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: 960 - pre_nn_hidden_dims: 3840 - pre_nn_edges_out_dim: 480 - pre_nn_edges_hidden_dims: 1920 - gnn_out_dim: 960 - gnn_hidden_dims: 960 - mpnn_in_dim: 960 - mpnn_out_dim: 960 - mpnn_in_dim_edges: 480 - mpnn_out_dim_edges: 480 - graph_output_nn_out_dims: 960 - graph_output_nn_hidden_dims: 960 - node_output_nn_out_dims: 960 - node_output_nn_hidden_dims: 960 - l1000_vcap_hidden_dims: 480 - l1000_mcf7_hidden_dims: 480 - pcba_1328_hidden_dims: 240 - pcqm4m_g25_hidden_dims: 120 - pcqm4m_n4_hidden_dims: 120 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 3 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. - - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git 
a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml deleted file mode 100644 index 01b0bf315..000000000 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml +++ /dev/null @@ -1,482 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/net/group/research/kerstink/neurips2023-large/" - epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: &gnn_dim 1024 # original 256 - pre_nn_hidden_dims: 4096 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 - pre_nn_edges_hidden_dims: 2048 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 512 # original 128 - l1000_mcf7_hidden_dims: 512 # original 128 - pcba_1328_hidden_dims: 256 # original 64 - pcqm4m_g25_hidden_dims: 128 # original 32 - pcqm4m_n4_hidden_dims: 128 # original 32 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 128 - max_num_edges: 272 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 128 - max_num_edges: 272 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 2 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(1) # IPU would require large batches to be ready for the model. 
- - replicationFactor(1) - - 'setAvailableMemoryProportion({"IPU0": 0.1})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph 
- splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
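[Editorial note: the `rw_pos` featurization above requests `rw_return_probs` with `ksteps: 16`, i.e. for every node the probability that a k-step random walk returns to its start node, for k = 1..16. A minimal dense-matrix sketch of that quantity is shown below; it assumes a small symmetric adjacency matrix and only illustrates the definition, it is not Graphium's actual featurizer.]

import torch

def random_walk_return_probs(adj: torch.Tensor, ksteps: int = 16) -> torch.Tensor:
    # Row-normalize the adjacency matrix to get the random-walk transition matrix.
    deg = adj.sum(dim=1, keepdim=True).clamp(min=1.0)
    transition = adj / deg

    probs = []
    walk = torch.eye(adj.shape[0])
    for _ in range(ksteps):
        walk = walk @ transition
        probs.append(torch.diagonal(walk))  # probability of being back at the start node after k steps
    return torch.stack(probs, dim=1)  # shape: [num_nodes, ksteps]

# Toy 3-node path graph, for illustration only.
adj = torch.tensor([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0]])
pe = random_walk_return_probs(adj, ksteps=16)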
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - 
la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - From ba48d3e40429f910e545a816de61d895a555a88f Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Wed, 11 Oct 2023 10:10:32 +0000 Subject: [PATCH 51/58] Simple attempt at logging epochs --- expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml | 3 ++- graphium/trainer/predictor.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index 251bdae15..ad803ffa2 100644 --- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -6,6 +6,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training datacache_path: "/net/group/research/kerstink/neurips2023-large/" + # datacache_path: "/localdata/neurips2023-large/" epochs: 100 name: SF_11M_sweep_LargeMix_mpnn wandb: @@ -64,7 +65,7 @@ accelerator: accumulate_grad_batches: 2 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. + - deviceIterations(16) # IPU would require large batches to be ready for the model. - replicationFactor(16) # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index c4e700895..6f8939e8a 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -461,6 +461,9 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None: # Get the metrics that are logged at every step (loss, grad_norm, batch_time, batch_tput) concatenated_metrics_logs = {} concatenated_metrics_logs["train/loss"] = outputs["loss"] + concatenated_metrics_logs["epoch_count"] = self.current_epoch + # TODO: Samuelm - we need a number of samples here as well if this works? 
+ # import ipdb; ipdb.set_trace() # report the training loss for each individual tasks for task in self.tasks: From d3d1ff3cac93be78ef1d2a2ed667686987b719b0 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Thu, 12 Oct 2023 15:23:52 +0000 Subject: [PATCH 52/58] Samples seen --- graphium/cli/train_finetune_test.py | 51 +++++++++++++++++++++++++++++ graphium/config/_loader.py | 9 +++++ graphium/trainer/predictor.py | 17 ++++++++-- 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index e91fd7681..044bba3e6 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -48,6 +48,48 @@ def cli(cfg: DictConfig) -> None: """ return run_training_finetuning_testing(cfg) +def get_replication_factor(cfg): + try: + ipu_config = cfg.get("accelerator", {}).get("ipu_config", []) + for item in ipu_config: + if "replicationFactor" in item: + # Extract the number between parentheses + start = item.find("(") + 1 + end = item.find(")") + if start != 0 and end != -1: + return int(item[start:end]) + except Exception as e: + print(f"An error occurred: {e}") + + # Return default value if replicationFactor is not found or an error occurred + return 1 + +def get_gradient_accumulation_factor(cfg): + try: + # Navigate through the nested dictionaries and get the gradient accumulation factor + grad_accumulation_factor = cfg.get("accelerator", {}).get("config_override", {}).get("trainer", {}).get("trainer", {}).get("accumulate_grad_batches", 1) + + # Ensure that the extracted value is an integer + return int(grad_accumulation_factor) + except Exception as e: + print(f"An error occurred: {e}") + + # Return default value if an error occurred + return 1 + +def get_training_batch_size(cfg): + try: + # Navigate through the nested dictionaries and get the training batch size + batch_size_training = cfg.get("accelerator", {}).get("config_override", {}).get("datamodule", {}).get("args", {}).get("batch_size_training", 1) + + # Ensure that the extracted value is an integer + return int(batch_size_training) + except Exception as e: + print(f"An error occurred: {e}") + + # Return default value if an error occurred + return 1 + def run_training_finetuning_testing(cfg: DictConfig) -> None: """ @@ -80,6 +122,12 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: st = timeit.default_timer() + replicas = get_replication_factor(cfg) + gradient_acc = get_gradient_accumulation_factor(cfg) + micro_bs = get_training_batch_size(cfg) + + global_bs = replicas * gradient_acc * micro_bs + # Disable wandb if the user is not logged in. 
wandb_cfg = cfg["constants"].get("wandb") if wandb_cfg is not None and wandb.login() is False: @@ -124,6 +172,9 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: accelerator_type=accelerator_type, featurization=datamodule.featurization, task_norms=datamodule.task_norms, + replicas=replicas, + gradient_acc=gradient_acc, + global_bs=global_bs, ) logger.info(predictor.model) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 0764492ff..342e9f869 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -289,6 +289,9 @@ def load_predictor( accelerator_type: str, featurization: Dict[str, str] = None, task_norms: Optional[Dict[Callable, Any]] = None, + replicas: int = 1, + gradient_acc: int = 1, + global_bs: int = 1, ) -> PredictorModule: """ Defining the predictor module, which handles the training logic from `lightning.LighningModule` @@ -314,6 +317,9 @@ def load_predictor( task_levels=task_levels, featurization=featurization, task_norms=task_norms, + replicas=replicas, + gradient_acc=gradient_acc, + global_bs=global_bs, **cfg_pred, ) @@ -327,6 +333,9 @@ def load_predictor( model_class=model_class, model_kwargs=scaled_model_kwargs, metrics=metrics, + replicas=replicas, + gradient_acc=gradient_acc, + global_bs=global_bs, **cfg_pred, ) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index 6f8939e8a..9be10eee0 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -46,6 +46,9 @@ def __init__( flag_kwargs: Dict[str, Any] = None, task_norms: Optional[Dict[Callable, Any]] = None, metrics_every_n_train_steps: Optional[int] = None, + replicas: int = 1, + gradient_acc: int = 1, + global_bs: Optional[int] = 1, ): """ The Lightning module responsible for handling the predictions, losses, metrics, optimization, etc. @@ -175,6 +178,9 @@ def __init__( self.metrics_every_n_train_steps = metrics_every_n_train_steps # Wether save preds and targets for each training step. + self.samples_seen = 0 + self.global_bs = global_bs + def forward( self, inputs: Dict ) -> Dict[str, Union[Tensor, Dict[str, Tensor], Dict[str, Dict[str, Tensor]]]]: @@ -377,6 +383,9 @@ def _general_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) # print("loss ", self.global_step, self.current_epoch, loss) step_dict["task_losses"] = task_losses step_dict["gradient_norm"] = self.get_gradient_norm() + # samuelm + # self.samples_seen += 1 + # step_dict["samples_seen"] = self.samples_seen return step_dict def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> Dict[str, Any]: @@ -446,6 +455,9 @@ def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> D step_dict[f"loss/{step_name}"] = loss.detach().cpu() step_dict["loss"] = loss step_dict["task_losses"] = task_losses + # samuelm + # self.samples_seen += 1 + # step_dict["samples_seen"] = self.samples_seen return step_dict def on_train_batch_start(self, batch: Any, batch_idx: int) -> Optional[int]: @@ -462,8 +474,9 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None: concatenated_metrics_logs = {} concatenated_metrics_logs["train/loss"] = outputs["loss"] concatenated_metrics_logs["epoch_count"] = self.current_epoch - # TODO: Samuelm - we need a number of samples here as well if this works? 
- # import ipdb; ipdb.set_trace() + # Incriment by the batch size + self.samples_seen += self.global_bs + concatenated_metrics_logs["samples_seen"] = self.samples_seen # report the training loss for each individual tasks for task in self.tasks: From 127f66ba15a1e95156aa48f96986cb1b59374669 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Thu, 12 Oct 2023 15:26:03 +0000 Subject: [PATCH 53/58] updated configs for merge --- .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 29 +- .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 4 +- .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 2 + .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 486 ++++++++++++++++++ 4 files changed, 506 insertions(+), 15 deletions(-) create mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml index 0b0a562a0..8cbc58681 100644 --- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -45,27 +45,27 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 # Data handling-related - batch_size_training: 8 - batch_size_inference: 8 + batch_size_training: 3 + batch_size_inference: 3 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 8192 + loss_scaling: 1024 trainer: trainer: precision: 16-true - accumulate_grad_batches: 16 + accumulate_grad_batches: 320 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(2) + - deviceIterations(5) # IPU would require large batches to be ready for the model. 
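[Editorial note: the samples-seen bookkeeping introduced in the two patches above amounts to parsing the replication factor out of the `ipu_config` strings, multiplying it by the gradient-accumulation factor and the micro-batch size to obtain a global batch size, and adding that number to a running counter at the end of every training batch. A condensed, self-contained sketch of the same arithmetic follows, using a regex instead of the string slicing in `get_replication_factor`, with values borrowed from the SF_11M config purely for illustration.]

import re

def parse_replication_factor(ipu_config):
    # e.g. "replicationFactor(16)" -> 16; default to 1 when the option is absent.
    for opt in ipu_config:
        match = re.search(r"replicationFactor\((\d+)\)", opt)
        if match:
            return int(match.group(1))
    return 1

replicas = parse_replication_factor(["deviceIterations(16)", "replicationFactor(16)"])
gradient_acc = 2   # trainer.trainer.accumulate_grad_batches in the SF_11M config
micro_bs = 30      # datamodule.args.batch_size_training in the SF_11M config
global_bs = replicas * gradient_acc * micro_bs  # 16 * 2 * 30 = 960

samples_seen = 0
for _ in range(5):             # pretend five training batches have completed
    samples_seen += global_bs  # mirrors the increment in on_train_batch_end
print(samples_seen)            # 4800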
+ - replicationFactor(1) # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") - TensorLocations.numIOTiles(128) @@ -82,8 +82,8 @@ accelerator: accelerator_kwargs: _accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] #gnn_layers_per_ipu: [4,4,4,4] datamodule: @@ -176,6 +176,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: @@ -381,8 +382,8 @@ architecture: layer_type: 'pyg:gps' layer_kwargs: mpnn_type: 'pyg:mpnnplus' - node_residual: True - edge_residual: True + node_residual: false + edge_residual: false output_scale: 1.0 mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 6dba7e4ad..692c8e0de 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -61,7 +61,7 @@ accelerator: predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1 + loss_scaling: 1024 trainer: trainer: precision: 16-true @@ -386,6 +386,8 @@ architecture: layer_type: 'pyg:gps' layer_kwargs: node_residual: false + edge_residual: false + output_scale: 1.0 mpnn_type: 'pyg:mpnnplus' mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml index 5e537c039..944f6eb0b 100644 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -387,6 +387,8 @@ architecture: layer_type: 'pyg:gps' layer_kwargs: node_residual: false + edge_residual: false + output_scale: 1.0 mpnn_type: 'pyg:mpnnplus' mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..6bfc447c4 --- /dev/null +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,486 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_671M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_671M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. 
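[Editorial note: the `gnn_layers_per_ipu` lists above describe how the 16 GPS++ layers (`gnn.depth: 16`) are pipelined across IPUs: sixteen stages of one layer each for the largest models, versus `[2,2,2,2,2,2,2,2]` or `[4,4,4,4]` for smaller ones. A tiny sanity-check helper along the lines below can catch a mismatch between the split and the depth before compilation; it is an illustrative snippet, not part of Graphium.]

from typing import Sequence

def check_pipeline_split(gnn_layers_per_ipu: Sequence[int], gnn_depth: int) -> None:
    # Every pipeline stage must hold at least one layer, and the stages together
    # must account for exactly the number of GNN layers in the architecture.
    if any(n < 1 for n in gnn_layers_per_ipu):
        raise ValueError("Each IPU must be assigned at least one GNN layer")
    if sum(gnn_layers_per_ipu) != gnn_depth:
        raise ValueError(
            f"Split {list(gnn_layers_per_ipu)} covers {sum(gnn_layers_per_ipu)} layers, "
            f"but the GNN depth is {gnn_depth}"
        )

check_pipeline_split([1] * 16, gnn_depth=16)      # the 16-stage split used above
check_pipeline_split([4, 4, 4, 4], gnn_depth=16)  # the 4-IPU split commented out elsewhere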
+ +dimensions: + pre_nn_out_dim: 1024 + pre_nn_hidden_dims: 4096 + pre_nn_edges_out_dim: 512 + pre_nn_edges_hidden_dims: 2048 + gnn_out_dim: 1024 + gnn_hidden_dims: 1024 + mpnn_in_dim: 1024 + mpnn_out_dim: 1024 + mpnn_in_dim_edges: 512 + mpnn_out_dim_edges: 512 + graph_output_nn_out_dims: 1024 + graph_output_nn_hidden_dims: 1024 + node_output_nn_out_dims: 1024 + node_output_nn_hidden_dims: 1024 + l1000_vcap_hidden_dims: 512 + l1000_mcf7_hidden_dims: 512 + pcba_1328_hidden_dims: 256 + pcqm4m_g25_hidden_dims: 128 + pcqm4m_n4_hidden_dims: 128 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 144 + max_num_edges: 304 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 144 + max_num_edges: 304 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 2 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(5) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + - 'setAvailableMemoryProportion({"IPU0": 0.1})' + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(16) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_atoms: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 16 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + 
multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + edge_residual: false + output_scale: 1.0 + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
+      residual_type: none
+
+
From e3b21ae9f398f593836e40485e2c5ef0eb2fae5a Mon Sep 17 00:00:00 2001
From: Sam Maddrell-Mander
Date: Thu, 12 Oct 2023 18:17:37 +0000
Subject: [PATCH 54/58] removing configs and tidying the edge max limit

---
 .../SF_11M_config_LargeMix_mpnn_GPS++.yaml    | 478 -----------------
 .../SF_169M_config_LargeMix_mpnn_GPS++.yaml   | 479 -----------------
 .../SF_378M_config_LargeMix_mpnn_GPS++.yaml   | 483 -----------------
 .../SF_42M_config_LargeMix_mpnn_GPS++.yaml    | 476 -----------------
 .../SF_590M_config_LargeMix_mpnn_GPS++.yaml   | 484 -----------------
 .../SF_671M_config_LargeMix_mpnn_GPS++.yaml   | 486 ------------------
 expts/foundation_model/__init__.py            |   0
 .../config_LargeMix_mpnn_GPS++.yaml           | 462 -----------------
 .../config_ogbpcq_mpnn_GPS++_newDATA.yaml     | 295 -----------
 expts/iclr2023_configs/__init__.py            |   0
 .../base_config/ogbpcqm4mv2.yaml              | 288 -----------
 .../base_config/ogbpcqm4mv2_GPS++.yaml        | 288 -----------
 .../config_ogbpcq_GCN_16layers.yaml           | 263 ----------
 .../config_ogbpcq_GCN_4layers.yaml            | 263 ----------
 .../config_ogbpcq_GINE_16layers.yaml          | 273 ----------
 .../config_ogbpcq_GINE_4layers.yaml           | 273 ----------
 .../iclr2023_configs/config_ogbpcq_mpnn.yaml  |  57 --
 .../config_ogbpcq_mpnn_GPS++.yaml             | 295 -----------
 .../config_ogbpcq_mpnn_GPS++_newDATA.yaml     | 295 -----------
 ...fig_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml | 295 -----------
 .../config_ogbpcq_mpnn_JosefOG.yaml           |  57 --
 .../config_ogbpcq_mpnn_hydradims.yaml         | 101 ----
 .../config_ogbpcq_mpnn_largerffn.yaml         |  58 ---
 graphium/features/featurizer.py               |   3 -
 24 files changed, 6452 deletions(-)
 delete mode 100644 expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/__init__.py
 delete mode 100644 expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml
 delete mode 100644 expts/iclr2023_configs/__init__.py
 delete mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml
 delete mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml

diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 124b1d250..000000000
---
a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,478 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_11M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_11M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: &gnn_dim 256 # original 256 - pre_nn_hidden_dims: 1024 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 128 # original 128 - pre_nn_edges_hidden_dims: 512 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 128 # original 128 - l1000_mcf7_hidden_dims: 128 # original 128 - pcba_1328_hidden_dims: 64 # original 64 - pcqm4m_g25_hidden_dims: 32 # original 32 - pcqm4m_n4_hidden_dims: 32 # original 32 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(16) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - #_accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - #gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: ./models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: True - edge_residual: True - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
-      residual_type: none
-
-
diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 8cbc58681..000000000
---
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - mpnn_type: 'pyg:mpnnplus' - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
-      residual_type: none
-
-
diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 692c8e0de..000000000
---
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
-      residual_type: none
-
-
diff --git a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 04c393733..000000000
---
- - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
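The accelerator block of this deleted 42M LargeMix config multiplies several factors: the micro-batch size (batch_size_training: 16), deviceIterations(30), replicationFactor(4) and accumulate_grad_batches: 16. A minimal sketch of that arithmetic follows, assuming the usual PopTorch convention that replication and gradient accumulation enlarge the batch seen per weight update while device iterations only set how many such batches are consumed per host step; the helper name and printed numbers are illustrative, not part of Graphium.

# Hypothetical helper relating the accelerator settings to batch sizes
# (assumes standard PopTorch semantics; not part of the Graphium code base).
def ipu_batch_sizes(micro_batch: int, device_iterations: int,
                    replication_factor: int, grad_accum: int) -> dict:
    """Samples seen per weight update and per host-side step."""
    per_update = micro_batch * replication_factor * grad_accum
    per_host_step = per_update * device_iterations
    return {"per_weight_update": per_update, "per_host_step": per_host_step}

# Values taken from the config above: batch_size_training 16, deviceIterations(30),
# replicationFactor(4), accumulate_grad_batches 16.
print(ipu_batch_sizes(16, 30, 4, 16))
# {'per_weight_update': 1024, 'per_host_step': 30720}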
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - 
encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - 
last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 944f6eb0b..000000000 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,484 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_590M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_590M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: 960 - pre_nn_hidden_dims: 3840 - pre_nn_edges_out_dim: 480 - pre_nn_edges_hidden_dims: 1920 - gnn_out_dim: 960 - gnn_hidden_dims: 960 - mpnn_in_dim: 960 - mpnn_out_dim: 960 - mpnn_in_dim_edges: 480 - mpnn_out_dim_edges: 480 - graph_output_nn_out_dims: 960 - graph_output_nn_hidden_dims: 960 - node_output_nn_out_dims: 960 - node_output_nn_hidden_dims: 960 - l1000_vcap_hidden_dims: 480 - l1000_mcf7_hidden_dims: 480 - pcba_1328_hidden_dims: 240 - pcqm4m_g25_hidden_dims: 120 - pcqm4m_n4_hidden_dims: 120 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 298 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 298 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 3 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: 
graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
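The accelerator block of this deleted 590M config pipelines the model with gnn_layers_per_ipu set to sixteen ones, i.e. one GNN layer per IPU across sixteen IPUs, whereas the 42M config above uses [4, 4, 4, 4]. Below is a small illustrative sketch (not Graphium's implementation) of how such a list maps layer indices to pipeline stages and why its entries must sum to the GNN depth defined further down in the architecture section.

from typing import List

def layer_to_ipu(gnn_layers_per_ipu: List[int]) -> List[int]:
    """Expand e.g. [1]*16 or [4, 4, 4, 4] into one pipeline-stage index per GNN layer."""
    stages: List[int] = []
    for ipu_index, n_layers in enumerate(gnn_layers_per_ipu):
        stages.extend([ipu_index] * n_layers)
    return stages

assert len(layer_to_ipu([1] * 16)) == 16   # must equal gnn.depth (16 in this config)
print(layer_to_ipu([4, 4, 4, 4]))          # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]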
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - 
encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - 
normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 6bfc447c4..000000000 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,486 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: 1024 - pre_nn_hidden_dims: 4096 - pre_nn_edges_out_dim: 512 - pre_nn_edges_hidden_dims: 2048 - gnn_out_dim: 1024 - gnn_hidden_dims: 1024 - mpnn_in_dim: 1024 - mpnn_out_dim: 1024 - mpnn_in_dim_edges: 512 - mpnn_out_dim_edges: 512 - graph_output_nn_out_dims: 1024 - graph_output_nn_hidden_dims: 1024 - node_output_nn_out_dims: 1024 - node_output_nn_hidden_dims: 1024 - l1000_vcap_hidden_dims: 512 - l1000_mcf7_hidden_dims: 512 - pcba_1328_hidden_dims: 256 - pcqm4m_g25_hidden_dims: 128 - pcqm4m_n4_hidden_dims: 128 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 144 - max_num_edges: 304 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 144 - max_num_edges: 304 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 2 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - 'setAvailableMemoryProportion({"IPU0": 0.1})' - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # 
sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
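The PCQM4M tasks above set label_normalization with method "normal" and normalize_val_test: True. Read literally, that is a z-score computed from the training split and reused on the validation and test splits; the sketch below illustrates only that interpretation, since Graphium's own normalizer may handle NaN targets and denormalisation differently. The function names are hypothetical.

import numpy as np

def fit_normal(train_labels: np.ndarray):
    """Per-column mean/std, ignoring NaNs, computed from the training split only."""
    return np.nanmean(train_labels, axis=0), np.nanstd(train_labels, axis=0)

def apply_normal(labels: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    return (labels - mean) / std

train = np.array([[1.0], [2.0], [3.0]])
mean, std = fit_normal(train)
print(apply_normal(np.array([[2.5]]), mean, std))  # val/test reuse the train statistics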
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - 
encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - 
normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git a/expts/foundation_model/__init__.py b/expts/foundation_model/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index ce6914154..000000000 --- a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_40M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_40M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 16 - batch_size_inference: 16 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml deleted file mode 100644 index b97402680..000000000 --- 
a/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "smiles" - label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/__init__.py b/expts/iclr2023_configs/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml b/expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml deleted file mode 100644 index 571788581..000000000 --- a/expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml +++ /dev/null @@ -1,288 +0,0 @@ -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - 
deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -# accelerator: -# type: cpu # cpu or ipu or gpu -# config_override: -# datamodule: -# args: -# batch_size_training: 64 -# batch_size_inference: 256 -# trainer: -# trainer: -# precision: 32 -# accumulate_grad_batches: 1 - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. 
- # Using persistent_workers false might make the start of each epoch very long. - - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null # Set as null to avoid a pre-nn network - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 768 - hidden_dims: *gnn_dim - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # 
save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 diff --git a/expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml b/expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml deleted file mode 100644 index 09174a096..000000000 --- a/expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml +++ /dev/null @@ -1,288 +0,0 @@ -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -# accelerator: -# type: cpu # cpu or ipu or gpu -# config_override: -# datamodule: -# args: -# batch_size_training: 64 -# batch_size_inference: 256 -# trainer: -# trainer: -# precision: 32 -# accumulate_grad_batches: 1 - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null # Set as null to avoid a pre-nn network - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 1024 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml deleted file mode 100644 index bffa2ee04..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml +++ /dev/null @@ -1,263 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gcn - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gcn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 768 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml deleted file mode 100644 index ef46cda2a..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml +++ /dev/null @@ -1,263 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gcn - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gcn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - # _accelerator: "ipu" - # gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 768 - hidden_dims: *gnn_dim - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml deleted file mode 100644 index bde17fe0d..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml +++ /dev/null @@ -1,273 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gine - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gine - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: # Set as null to avoid a pre-nn network - out_dim: 32 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: *normalization - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - gnn: # Set as null to avoid a post-nn network - out_dim: &gnn_dim 704 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml deleted file mode 100644 index ff173adde..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml +++ /dev/null @@ -1,273 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gine - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gine - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - # _accelerator: "ipu" - # gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: # Set as null to avoid a pre-nn network - out_dim: 32 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: *normalization - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - gnn: # Set as null to avoid a post-nn network - out_dim: &gnn_dim 704 - hidden_dims: *gnn_dim - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn.yaml deleted file mode 100644 index e57a403d7..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - - pre_nn: - out_dim: 160 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 64 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - gnn: # Set as null to avoid a post-nn network - in_dim: 160 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml deleted file mode 100644 index 61f645c44..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_no1hot - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_no1hot - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - # atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [atomic-number, group, period, total-valence, degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml deleted file mode 100644 index b97402680..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "smiles" - label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml deleted file mode 100644 index 94535b7cc..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData_4layers - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData_4layers - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(1) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - # _accelerator: "ipu" - # gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "smiles" - label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml deleted file mode 100644 index e57a403d7..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - - pre_nn: - out_dim: 160 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 64 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - gnn: # Set as null to avoid a post-nn network - in_dim: 160 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml deleted file mode 100644 index 0dbea9917..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: # Set as null to avoid a pre-nn network - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: *normalization - last_normalization: *normalization - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - out_dim: 256 - hidden_dims: 256 - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - layer_type: 'pyg:gps' - layer_kwargs: # Parameters for the model itself. You could define dropout_attn: 0.1 - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 - out_dim: 256 - in_dim_edges: 128 - out_dim_edges: 128 - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - virtual_node: 'sum' - use_virtual_edges: true - - graph_output_nn: - graph: - pooling: [sum] - out_dim: 256 - hidden_dims: 256 - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml deleted file mode 100644 index 1a51857fe..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - - pre_nn: - out_dim: 280 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 64 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - gnn: # Set as null to avoid a post-nn network - in_dim: 280 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 280 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 280 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 280 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - virtual_node: 'sum' - use_virtual_edges: true diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py index cb19e3b4b..d8efdb2ab 100644 --- a/graphium/features/featurizer.py +++ b/graphium/features/featurizer.py @@ -1062,10 +1062,7 @@ def mol_to_graph_dict( mol = Chem.AddHs(mol) else: mol = Chem.RemoveHs(mol) - 
# SAMUELM: Temp fix - max_num_bonds = 265 num_atoms = mol.GetNumAtoms() - num_bonds = mol.GetNumBonds() if (max_num_atoms is not None) and (num_atoms > max_num_atoms): raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}") ( From 120a447f5bab47f0eebd816d191f0f2b43192d6b Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Thu, 12 Oct 2023 18:22:00 +0000 Subject: [PATCH 55/58] Linting --- graphium/cli/train_finetune_test.py | 29 ++++++++++++++++++++++------- graphium/config/_loader.py | 2 +- graphium/trainer/predictor.py | 4 ++-- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index 4a2de6a04..d21fe3c3c 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -51,6 +51,7 @@ def cli(cfg: DictConfig) -> None: """ return run_training_finetuning_testing(cfg) + def get_replication_factor(cfg): try: ipu_config = cfg.get("accelerator", {}).get("ipu_config", []) @@ -63,15 +64,22 @@ def get_replication_factor(cfg): return int(item[start:end]) except Exception as e: print(f"An error occurred: {e}") - + # Return default value if replicationFactor is not found or an error occurred return 1 + def get_gradient_accumulation_factor(cfg): try: # Navigate through the nested dictionaries and get the gradient accumulation factor - grad_accumulation_factor = cfg.get("accelerator", {}).get("config_override", {}).get("trainer", {}).get("trainer", {}).get("accumulate_grad_batches", 1) - + grad_accumulation_factor = ( + cfg.get("accelerator", {}) + .get("config_override", {}) + .get("trainer", {}) + .get("trainer", {}) + .get("accumulate_grad_batches", 1) + ) + # Ensure that the extracted value is an integer return int(grad_accumulation_factor) except Exception as e: @@ -80,10 +88,17 @@ def get_gradient_accumulation_factor(cfg): # Return default value if an error occurred return 1 + def get_training_batch_size(cfg): try: # Navigate through the nested dictionaries and get the training batch size - batch_size_training = cfg.get("accelerator", {}).get("config_override", {}).get("datamodule", {}).get("args", {}).get("batch_size_training", 1) + batch_size_training = ( + cfg.get("accelerator", {}) + .get("config_override", {}) + .get("datamodule", {}) + .get("args", {}) + .get("batch_size_training", 1) + ) # Ensure that the extracted value is an integer return int(batch_size_training) @@ -107,7 +122,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: # Format the datetime as a string filename_datetime_suffix = now.strftime("%Y%m%d_%H%M%S") # Append the datetime string to the existing filename in the cfg dictionary - cfg['trainer']['model_checkpoint']['filename'] += f"_{filename_datetime_suffix}" + cfg["trainer"]["model_checkpoint"]["filename"] += f"_{filename_datetime_suffix}" dst_dir = cfg["constants"].get("results_dir") hydra_cfg = HydraConfig.get() @@ -129,9 +144,9 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: replicas = get_replication_factor(cfg) gradient_acc = get_gradient_accumulation_factor(cfg) micro_bs = get_training_batch_size(cfg) - + global_bs = replicas * gradient_acc * micro_bs - + # Disable wandb if the user is not logged in. 
wandb_cfg = cfg["constants"].get("wandb") if wandb_cfg is not None and wandb.login() is False: diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index fc005890c..cf2c80a3f 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -337,7 +337,7 @@ def load_predictor( featurization=featurization, task_norms=task_norms, replicas=replicas, - gradient_acc=gradient_acc, + gradient_acc=gradient_acc, global_bs=global_bs, **cfg_pred, ) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index fe270bc44..dd6e77002 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -179,7 +179,7 @@ def __init__( # Wether save preds and targets for each training step. self.samples_seen = 0 - self.global_bs = global_bs + self.global_bs = global_bs def forward( self, inputs: Dict @@ -475,7 +475,7 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None: concatenated_metrics_logs = {} concatenated_metrics_logs["train/loss"] = outputs["loss"] concatenated_metrics_logs["epoch_count"] = self.current_epoch - # Incriment by the batch size + # Incriment by the batch size self.samples_seen += self.global_bs concatenated_metrics_logs["samples_seen"] = self.samples_seen From bef2b61f4b8dc960b816337e9706c73a462bd537 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander <68224909+s-maddrellmander@users.noreply.github.com> Date: Thu, 12 Oct 2023 19:36:08 +0100 Subject: [PATCH 56/58] Update graphium/trainer/predictor.py --- graphium/trainer/predictor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index dd6e77002..db029c345 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -456,9 +456,6 @@ def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> D step_dict[f"loss/{step_name}"] = loss.detach().cpu() step_dict["loss"] = loss step_dict["task_losses"] = task_losses - # samuelm - # self.samples_seen += 1 - # step_dict["samples_seen"] = self.samples_seen return step_dict def on_train_batch_start(self, batch: Any, batch_idx: int) -> Optional[int]: From 97558ffc7299c64f923f8c0e739a60b7c3450b48 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander <68224909+s-maddrellmander@users.noreply.github.com> Date: Thu, 12 Oct 2023 19:36:18 +0100 Subject: [PATCH 57/58] Update graphium/nn/pyg_layers/gps_pyg.py --- graphium/nn/pyg_layers/gps_pyg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 6982d6750..7af7107ac 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -147,7 +147,7 @@ def __init__( Keyword arguments to pass to the attention layer output_scale: - Float value that will be used to scale the activations, helps reudce growth of activations + Float value that will be used to scale the activations, helps reduce growth of activations as the model gets deeper. Default value of 1.0 leaves the layer unchanged. 
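For context on the helpers introduced in PATCH 55 above: the effective global batch size is assembled from three separate places in the Hydra config — the replicationFactor(...) entry of accelerator.ipu_config, accumulate_grad_batches under the trainer override, and batch_size_training under the datamodule override. The stand-alone Python sketch below redoes that arithmetic on a hand-written config dict; the dict, the regex-based parsing, and the printed numbers are illustrative stand-ins (the patch itself uses plain string slicing), with values borrowed from the example IPU configs earlier in this series.

import re

# Hand-written stand-in for the Hydra config; values mirror the example IPU configs above.
cfg = {
    "accelerator": {
        "ipu_config": ["deviceIterations(60)", "replicationFactor(4)"],
        "config_override": {
            "datamodule": {"args": {"batch_size_training": 30}},
            "trainer": {"trainer": {"accumulate_grad_batches": 8}},
        },
    },
}

def replication_factor(cfg: dict) -> int:
    # Parse `replicationFactor(N)` out of the ipu_config list; default to 1 if absent.
    for item in cfg.get("accelerator", {}).get("ipu_config", []):
        match = re.search(r"replicationFactor\((\d+)\)", item)
        if match:
            return int(match.group(1))
    return 1

replicas = replication_factor(cfg)
grad_acc = cfg["accelerator"]["config_override"]["trainer"]["trainer"]["accumulate_grad_batches"]
micro_bs = cfg["accelerator"]["config_override"]["datamodule"]["args"]["batch_size_training"]

# Same formula as run_training_finetuning_testing in the patch: replicas * grad accumulation * micro batch.
global_bs = replicas * grad_acc * micro_bs
print(global_bs)  # 4 * 8 * 30 = 960

Note that, as in the patch, deviceIterations does not enter this product.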
From 4002482b0821a2cba71f942510e9d94fd57346a5 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander <68224909+s-maddrellmander@users.noreply.github.com> Date: Thu, 12 Oct 2023 19:36:25 +0100 Subject: [PATCH 58/58] Update graphium/trainer/predictor.py --- graphium/trainer/predictor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index db029c345..588d7e3f2 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -384,9 +384,6 @@ def _general_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) # print("loss ", self.global_step, self.current_epoch, loss) step_dict["task_losses"] = task_losses step_dict["gradient_norm"] = self.get_gradient_norm() - # samuelm - # self.samples_seen += 1 - # step_dict["samples_seen"] = self.samples_seen return step_dict def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> Dict[str, Any]:
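The last three patches also settle how samples_seen is tracked: the commented-out per-step increments in _general_step and flag_step are deleted, and the counter is instead advanced once per training batch in on_train_batch_end by the global batch size stored on the predictor. Below is a rough, self-contained sketch of that bookkeeping; the SampleCounter class, its calls, and the dummy loss values are invented for illustration and only mirror the logging flow of the patched Predictor.

class SampleCounter:
    def __init__(self, global_bs: int):
        # Set once from the config, as the patched Predictor.__init__ does with its global_bs argument.
        self.global_bs = global_bs
        self.samples_seen = 0

    def on_train_batch_end(self, epoch: int, train_loss: float) -> dict:
        logs = {"train/loss": train_loss, "epoch_count": epoch}
        # Increment by the effective batch size, mirroring the patched on_train_batch_end.
        self.samples_seen += self.global_bs
        logs["samples_seen"] = self.samples_seen
        return logs

counter = SampleCounter(global_bs=960)  # e.g. the value derived in the previous sketch
print(counter.on_train_batch_end(0, 0.42))                   # samples_seen == 960
print(counter.on_train_batch_end(0, 0.40)["samples_seen"])   # 1920

Keeping the increment in on_train_batch_end rather than in the step functions means the counter moves once per training batch by the full effective batch size, instead of by 1 per step as the removed commented-out code did.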