diff --git a/expts/neurips2023_configs/base_config/large.yaml b/expts/neurips2023_configs/base_config/large.yaml
new file mode 100644
index 000000000..da9760ab3
--- /dev/null
+++ b/expts/neurips2023_configs/base_config/large.yaml
@@ -0,0 +1,423 @@
+# @package _global_
+
+constants:
+  seed: &seed 42
+  raise_train_error: true # Whether the code should raise an error if it crashes during training
+  entity: multitask-gnn
+
+accelerator:
+  type: ipu # cpu or ipu or gpu
+  config_override:
+    datamodule:
+      args:
+        ipu_dataloader_training_opts:
+          mode: async
+          max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54
+          max_num_edges_per_graph: 100
+        ipu_dataloader_inference_opts:
+          mode: async
+          max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118
+          max_num_edges_per_graph: 100
+        # Data handling-related
+        batch_size_training: 30
+        batch_size_inference: 30
+    predictor:
+      metrics_every_n_train_steps: 1000
+      optim_kwargs:
+        loss_scaling: 1024
+    trainer:
+      trainer:
+        precision: 16-true
+        accumulate_grad_batches: 2
+
+  ipu_config:
+    - deviceIterations(30) # The IPU processes this many batches per step, so large batches must be ready for the model.
+    - replicationFactor(16)
+    # - enableProfiling("graph_analyser") # The folder where the profile will be stored
+    # - enableExecutableCaching("pop_compiler_cache")
+    - TensorLocations.numIOTiles(128)
+    - _Popart.set("defaultBufferingDepth", 96)
+    - Precision.enableStochasticRounding(True)
+    # - Precision.enableFloatingPointExceptions(True)
+
+  ipu_inference_config:
+    # set device iterations and replication factor to 1 during inference
+    # gradient accumulation was set to 1 in the code
+    - deviceIterations(1)
+    - replicationFactor(1)
+    - Precision.enableStochasticRounding(False)
+
+# accelerator:
+#   type: cpu # cpu or ipu or gpu
+#   config_override:
+#     datamodule:
+#       args:
+#         batch_size_training: 64
+#         batch_size_inference: 256
+#     trainer:
+#       trainer:
+#         precision: 32
+#         accumulate_grad_batches: 1
+
+datamodule:
+  module_type: "MultitaskFromSmilesDataModule"
+  # module_type: "FakeDataModule" # Option to use generated data
+  args: # Matches the setup in the test_multitask_datamodule.py test case.
+    task_specific_args: # To be replaced by a new class "DatasetParams"
+      l1000_vcap:
+        df: null
+        df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
+        # or set path as the URL directly
+        smiles_col: "SMILES"
+        label_cols: geneID-* # geneID-* means all columns starting with "geneID-"
+        # sample_size: 2000 # use sample_size for test
+        task_level: graph
+        splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt`
+        epoch_sampling_fraction: 1.0
+
+      l1000_mcf7:
+        df: null
+        df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
+        # or set path as the URL directly
+        smiles_col: "SMILES"
+        label_cols: geneID-* # geneID-* means all columns starting with "geneID-"
+        # sample_size: 2000 # use sample_size for test
+        task_level: graph
+        splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt`
+        epoch_sampling_fraction: 1.0
+
+      pcba_1328:
+        df: null
+        df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet
+        # or set path as the URL directly
+        smiles_col: "SMILES"
+        label_cols: assayID-* # assayID-* means all columns starting with "assayID-"
+        # sample_size: 2000 # use sample_size for test
+        task_level: graph
+        splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt`
+        epoch_sampling_fraction: 1.0
+
+      pcqm4m_g25:
+        df: null
+        df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
+        # or set path as the URL directly
+        smiles_col: "ordered_smiles"
+        label_cols: graph_* # graph_* means all columns starting with "graph_"
+        # sample_size: 2000 # use sample_size for test
+        task_level: graph
+        splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
+        label_normalization:
+          normalize_val_test: True
+          method: "normal"
+        epoch_sampling_fraction: 1.0
+
+      pcqm4m_n4:
+        df: null
+        df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
+        # or set path as the URL directly
+        smiles_col: "ordered_smiles"
+        label_cols: node_* # node_* means all columns starting with "node_"
+        # sample_size: 2000 # use sample_size for test
+        task_level: node
+        splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
+        seed: *seed
+        label_normalization:
+          normalize_val_test: True
+          method: "normal"
+        epoch_sampling_fraction: 1.0
+
+    # Featurization
+    prepare_dict_or_graph: pyg:graph
+    featurization_n_jobs: 30
+    featurization_progress: True
+    featurization_backend: "loky"
+    processed_graph_data_path: "../datacache/neurips2023-large/"
+    featurization:
+      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
+      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
+      # 'num_chiral_centers (not included yet)']
+      atom_property_list_onehot: [atomic-number, group, period, total-valence]
+      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
+      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
+      edge_property_list: [bond-type-onehot, stereo, in-ring]
+      add_self_loop: False
+      explicit_H: False # whether explicit hydrogens are included
+      use_bonds_weights: False
+      pos_encoding_as_features: # encoder dropout 0.18
+        pos_types:
+          lap_eigvec:
+            pos_level: node
+            pos_type: laplacian_eigvec
+            num_pos: 8
+            normalization: "none" # normalization already applied to the eigenvectors
+            disconnected_comp: True # whether to include eigenvalues/eigenvectors for disconnected graphs
+          lap_eigval:
+            pos_level: node
+            pos_type: laplacian_eigval
+            num_pos: 8
+            normalization: "none" # normalization already applied to the eigenvectors
+            disconnected_comp: True # whether to include eigenvalues/eigenvectors for disconnected graphs
+          rw_pos: # use same name as pe_encoder
+            pos_level: node
+            pos_type: rw_return_probs
+            ksteps: 16
+
+    # cache_data_path: .
+    num_workers: 32 # -1 to use all
+    persistent_workers: True # whether to keep dataloader workers alive between epochs.
+    # Setting persistent_workers to false can make the start of each epoch very slow.
+
+
+architecture:
+  model_type: FullGraphMultiTaskNetwork
+  mup_base_path: null
+  pre_nn: # Set as null to avoid a pre-nn network
+    out_dim: 64
+    hidden_dims: 256
+    depth: 2
+    activation: relu
+    last_activation: none
+    dropout: &dropout 0.1
+    normalization: &normalization layer_norm
+    last_normalization: *normalization
+    residual_type: none
+
+  pre_nn_edges: null
+
+  pe_encoders:
+    out_dim: 32
+    pool: "sum" #"mean" "max"
+    last_norm: None #"batch_norm", "layer_norm"
+    encoders: #la_pos | rw_pos
+      la_pos: # Set as null to avoid a pre-nn network
+        encoder_type: "laplacian_pe"
+        input_keys: ["laplacian_eigvec", "laplacian_eigval"]
+        output_keys: ["feat"]
+        hidden_dim: 64
+        out_dim: 32
+        model_type: 'DeepSet' #'Transformer' or 'DeepSet'
+        num_layers: 2
+        num_layers_post: 1 # Num. layers to apply after pooling
+        dropout: 0.1
+        first_normalization: "none" #"batch_norm" or "layer_norm"
+      rw_pos:
+        encoder_type: "mlp"
+        input_keys: ["rw_return_probs"]
+        output_keys: ["feat"]
+        hidden_dim: 64
+        out_dim: 32
+        num_layers: 2
+        dropout: 0.1
+        normalization: "layer_norm" #"batch_norm" or "layer_norm"
+        first_normalization: "layer_norm" #"batch_norm" or "layer_norm"
+
+
+
+  gnn: # Set as null to avoid a post-nn network
+    in_dim: 64 # or otherwise the correct value
+    out_dim: &gnn_dim 768
+    hidden_dims: *gnn_dim
+    depth: 4
+    activation: gelu
+    last_activation: none
+    dropout: 0.1
+    normalization: "layer_norm"
+    last_normalization: *normalization
+    residual_type: simple
+    virtual_node: 'none'
+
+
+
+  graph_output_nn:
+    graph:
+      pooling: [sum]
+      out_dim: *gnn_dim
+      hidden_dims: *gnn_dim
+      depth: 1
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+    node:
+      pooling: [sum]
+      out_dim: *gnn_dim
+      hidden_dims: *gnn_dim
+      depth: 1
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+
+  task_heads:
+    l1000_vcap:
+      task_level: graph
+      out_dim: 4890
+      hidden_dims: 128
+      depth: 2
+      activation: none
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+    l1000_mcf7:
+      task_level: graph
+      out_dim: 4890
+      hidden_dims: 128
+      depth: 2
+      activation: none
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+    pcba_1328:
+      task_level: graph
+      out_dim: 1328
+      hidden_dims: 64
+      depth: 2
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+    pcqm4m_g25:
+      task_level: graph
+      out_dim: 25
+      hidden_dims: 32
+      depth: 2
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+    pcqm4m_n4:
+      task_level: node
+      out_dim: 4
+      hidden_dims: 32
+      depth: 2
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+
+#Task-specific
+predictor:
+  metrics_on_progress_bar:
+    l1000_vcap: []
+    l1000_mcf7: []
+    pcba_1328: []
+    pcqm4m_g25: []
+    pcqm4m_n4: []
+  metrics_on_training_set:
+    l1000_vcap: []
+    l1000_mcf7: []
+    pcba_1328: []
+    pcqm4m_g25: []
+    pcqm4m_n4: []
+  loss_fun:
+    l1000_vcap:
+      name: hybrid_ce_ipu
+      n_brackets: 5
+      alpha: 0.5
+    l1000_mcf7:
+      name: hybrid_ce_ipu
+      n_brackets: 5
+      alpha: 0.5
+    pcba_1328: bce_logits_ipu
+    pcqm4m_g25: mae_ipu
+    pcqm4m_n4: mae_ipu
+  random_seed: *seed
+  optim_kwargs:
+    lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs
+    # weight_decay: 1.e-7
+  torch_scheduler_kwargs:
+    module_type: WarmUpLinearLR
+    max_num_epochs: &max_epochs 100
+    warmup_epochs: 10
+    verbose: False
+  scheduler_kwargs:
+    # monitor: &monitor qm9/mae/train
+    # mode: min
+    # frequency: 1
+  target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label
+  multitask_handling: flatten # flatten, mean-per-label
+
+# Task-specific
+metrics:
+  l1000_vcap: &classif_metrics
+    - name: auroc
+      metric: auroc_ipu
+      num_classes: 5
+      task: multiclass
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+    - name: avpr
+      metric: average_precision_ipu
+      num_classes: 5
+      task: multiclass
+      target_to_int: True
+      target_nan_mask: -1000
+      ignore_index: -1000
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+  l1000_mcf7: *classif_metrics
+  pcba_1328:
+    - name: auroc
+      metric: auroc_ipu
+      task: binary
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+    - name: avpr
+      metric: average_precision_ipu
+      task: binary
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+  pcqm4m_g25: &pcqm_metrics
+    - name: mae
+      metric: mae_ipu
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+    - name: pearsonr
+      metric: pearsonr_ipu
+      threshold_kwargs: null
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+    - name: r2
+      metric: r2_score_ipu
+      threshold_kwargs: null
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+  pcqm4m_n4: *pcqm_metrics
+
+trainer:
+  seed: *seed
+  logger:
+    save_dir: logs/neurips2023-large/
+    name: ${constants.name}
+    project: ${constants.name}
+  model_checkpoint:
+    dirpath: models_checkpoints/${constants.name}/
+    filename: ${constants.name}
+    # monitor: *monitor
+    # mode: *mode
+    # save_top_k: 1
+    save_last: True
+  trainer:
+    max_epochs: *max_epochs
+    min_epochs: 1
+    check_val_every_n_epoch: 20
diff --git a/expts/neurips2023_configs/base_config/small.yaml b/expts/neurips2023_configs/base_config/small.yaml
new file mode 100644
index 000000000..2e63477a1
--- /dev/null
+++ b/expts/neurips2023_configs/base_config/small.yaml
@@ -0,0 +1,343 @@
+# @package _global_
+
+constants:
+  seed: &seed 42
+  raise_train_error: true # Whether the code should raise an error if it crashes during training
+  entity: multitask-gnn
+
+accelerator:
+  type: ipu # cpu or ipu or gpu
+  config_override:
+    datamodule:
+      args:
+        ipu_dataloader_training_opts:
+          mode: async
+          max_num_nodes_per_graph: 44 # train max nodes: 20, max_edges: 54
+          max_num_edges_per_graph: 80
+        ipu_dataloader_inference_opts:
+          mode: async
+          max_num_nodes_per_graph: 44 # valid max nodes: 51, max_edges: 118
+          max_num_edges_per_graph: 80
+        # Data handling-related
+        batch_size_training: 50
+        batch_size_inference: 50
+    predictor:
+      optim_kwargs:
+        loss_scaling: 1024
+    trainer:
+      trainer:
+        precision: 16
+        accumulate_grad_batches: 4
+
+  ipu_config:
+    - deviceIterations(5) # The IPU processes this many batches per step, so large batches must be ready for the model.
+    - replicationFactor(16)
+    # - enableProfiling("graph_analyser") # The folder where the profile will be stored
+    # - enableExecutableCaching("pop_compiler_cache")
+    - TensorLocations.numIOTiles(128)
+    - _Popart.set("defaultBufferingDepth", 128)
+    - Precision.enableStochasticRounding(True)
+
+# accelerator:
+#   type: cpu # cpu or ipu or gpu
+#   config_override:
+#     datamodule:
+#       batch_size_training: 64
+#       batch_size_inference: 256
+#     trainer:
+#       trainer:
+#         precision: 32
+#         accumulate_grad_batches: 1
+
+datamodule:
+  module_type: "MultitaskFromSmilesDataModule"
+  # module_type: "FakeDataModule" # Option to use generated data
+  args: # Matches the setup in the test_multitask_datamodule.py test case.
+    task_specific_args: # To be replaced by a new class "DatasetParams"
+      qm9:
+        df: null
+        df_path: data/neurips2023/small-dataset/qm9.csv.gz
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz
+        # or set path as the URL directly
+        smiles_col: "smiles"
+        label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"]
+        # sample_size: 2000 # use sample_size for test
+        splits_path: data/neurips2023/small-dataset/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt`
+        seed: *seed
+        task_level: graph
+        label_normalization:
+          normalize_val_test: True
+          method: "normal"
+
+      tox21:
+        df: null
+        df_path: data/neurips2023/small-dataset/Tox21-7k-12-labels.csv.gz
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz
+        # or set path as the URL directly
+        smiles_col: "smiles"
+        label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"]
+        # sample_size: 2000 # use sample_size for test
+        splits_path: data/neurips2023/small-dataset/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt`
+        seed: *seed
+        task_level: graph
+
+      zinc:
+        df: null
+        df_path: data/neurips2023/small-dataset/ZINC12k.csv.gz
+        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz
+        # or set path as the URL directly
+        smiles_col: "smiles"
+        label_cols: ["SA", "logp", "score"]
+        # sample_size: 2000 # use sample_size for test
+        splits_path: data/neurips2023/small-dataset/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt`
+        seed: *seed
+        task_level: graph
+        label_normalization:
+          normalize_val_test: True
+          method: "normal"
+
+    # Featurization
+    prepare_dict_or_graph: pyg:graph
+    featurization_n_jobs: 30
+    featurization_progress: True
+    featurization_backend: "loky"
+    processed_graph_data_path: "../datacache/neurips2023-small/"
+    featurization:
+      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
+      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
+      # 'num_chiral_centers (not included yet)']
+      atom_property_list_onehot: [atomic-number, group, period, total-valence]
+      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
+      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
+      edge_property_list: [bond-type-onehot, stereo, in-ring]
+      add_self_loop: False
+      explicit_H: False # whether explicit hydrogens are included
+      use_bonds_weights: False
+      pos_encoding_as_features: # encoder dropout 0.18
+        pos_types:
+          lap_eigvec:
+            pos_level: node
+            pos_type: laplacian_eigvec
+            num_pos: 8
+            normalization: "none" # normalization already applied to the eigenvectors
+            disconnected_comp: True # whether to include eigenvalues/eigenvectors for disconnected graphs
+          lap_eigval:
+            pos_level: node
+            pos_type: laplacian_eigval
+            num_pos: 8
+            normalization: "none" # normalization already applied to the eigenvectors
+            disconnected_comp: True # whether to include eigenvalues/eigenvectors for disconnected graphs
+          rw_pos: # use same name as pe_encoder
+            pos_level: node
+            pos_type: rw_return_probs
+            ksteps: 16
+
+    # cache_data_path: .
+    num_workers: 30 # -1 to use all
+    persistent_workers: False # whether to keep dataloader workers alive between epochs.
+    # Setting persistent_workers to false can make the start of each epoch very slow.
+
+
+architecture:
+  model_type: FullGraphMultiTaskNetwork
+  mup_base_path: null
+  pre_nn: # Set as null to avoid a pre-nn network
+    out_dim: 64
+    hidden_dims: 256
+    depth: 2
+    activation: relu
+    last_activation: none
+    dropout: &dropout 0.18
+    normalization: &normalization layer_norm
+    last_normalization: *normalization
+    residual_type: none
+
+  pre_nn_edges: null # Set as null to avoid a pre-nn network
+
+  pe_encoders:
+    out_dim: 32
+    pool: "sum" #"mean" "max"
+    last_norm: None #"batch_norm", "layer_norm"
+    encoders: #la_pos | rw_pos
+      la_pos: # Set as null to avoid a pre-nn network
+        encoder_type: "laplacian_pe"
+        input_keys: ["laplacian_eigvec", "laplacian_eigval"]
+        output_keys: ["feat"]
+        hidden_dim: 64
+        out_dim: 32
+        model_type: 'DeepSet' #'Transformer' or 'DeepSet'
+        num_layers: 2
+        num_layers_post: 1 # Num. layers to apply after pooling
+        dropout: 0.1
+        first_normalization: "none" #"batch_norm" or "layer_norm"
+      rw_pos:
+        encoder_type: "mlp"
+        input_keys: ["rw_return_probs"]
+        output_keys: ["feat"]
+        hidden_dim: 64
+        out_dim: 32
+        num_layers: 2
+        dropout: 0.1
+        normalization: "layer_norm" #"batch_norm" or "layer_norm"
+        first_normalization: "layer_norm" #"batch_norm" or "layer_norm"
+
+
+
+  gnn: # Set as null to avoid a post-nn network
+    in_dim: 64 # or otherwise the correct value
+    out_dim: &gnn_dim 96
+    hidden_dims: *gnn_dim
+    depth: 4
+    activation: gelu
+    last_activation: none
+    dropout: 0.1
+    normalization: "layer_norm"
+    last_normalization: *normalization
+    residual_type: simple
+    virtual_node: 'none'
+    layer_kwargs: null # Parameters for the model itself. You could define dropout_attn: 0.1
+
+
+  graph_output_nn:
+    graph:
+      pooling: [sum]
+      out_dim: *gnn_dim
+      hidden_dims: *gnn_dim
+      depth: 1
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+
+  task_heads:
+    qm9:
+      task_level: graph
+      out_dim: 19
+      hidden_dims: 128
+      depth: 2
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+    tox21:
+      task_level: graph
+      out_dim: 12
+      hidden_dims: 64
+      depth: 2
+      activation: relu
+      last_activation: sigmoid
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+    zinc:
+      task_level: graph
+      out_dim: 3
+      hidden_dims: 32
+      depth: 2
+      activation: relu
+      last_activation: none
+      dropout: *dropout
+      normalization: *normalization
+      last_normalization: "none"
+      residual_type: none
+
+#Task-specific
+predictor:
+  metrics_on_progress_bar:
+    qm9: ["mae"]
+    tox21: ["auroc"]
+    zinc: ["mae"]
+  loss_fun:
+    qm9: mae_ipu
+    tox21: bce_ipu
+    zinc: mae_ipu
+  random_seed: *seed
+  optim_kwargs:
+    lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs
+    # weight_decay: 1.e-7
+  torch_scheduler_kwargs:
+    module_type: WarmUpLinearLR
+    max_num_epochs: &max_epochs 100
+    warmup_epochs: 10
+    verbose: False
+  scheduler_kwargs:
+    # monitor: &monitor qm9/mae/train
+    # mode: min
+    # frequency: 1
+  target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label
+  multitask_handling: flatten # flatten, mean-per-label
+
+# Task-specific
+metrics:
+  qm9: &qm9_metrics
+    - name: mae
+      metric: mae_ipu
+      target_nan_mask: null
+      multitask_handling: flatten
+      threshold_kwargs: null
+    - name: pearsonr
+      metric: pearsonr_ipu
+      threshold_kwargs: null
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+    - name: r2_score
+      metric: r2_score_ipu
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+  tox21:
+    - name: auroc
+      metric: auroc_ipu
+      task: binary
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+    - name: avpr
+      metric: average_precision_ipu
+      task: binary
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+    - name: f1 > 0.5
+      metric: f1
+      multitask_handling: mean-per-label
+      target_to_int: True
+      num_classes: 2
+      average: micro
+      threshold_kwargs: &threshold_05
+        operator: greater
+        threshold: 0.5
+        th_on_preds: True
+        th_on_target: True
+    - name: precision > 0.5
+      metric: precision
+      multitask_handling: mean-per-label
+      average: micro
+      threshold_kwargs: *threshold_05
+  zinc: *qm9_metrics
+
+trainer:
+  seed: *seed
+  logger:
+    save_dir: logs/neurips2023-small/
+    name: ${constants.name}
+    project: ${constants.name}
+  #early_stopping:
+  #  monitor: *monitor
+  #  min_delta: 0
+  #  patience: 10
+  #  mode: &mode min
+  model_checkpoint:
+    dirpath: models_checkpoints/${constants.name}/
+    filename: ${constants.name}
+    # monitor: *monitor
+    # mode: *mode
+    # save_top_k: 1
+    save_last: True
+  trainer:
+    max_epochs: *max_epochs
+    min_epochs: 1
+    check_val_every_n_epoch: 20
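The two base files above are pulled into the per-model configs below through Hydra's defaults list: `# @package _global_` makes the base config merge at the root of the composed config, and `_self_` makes the derived file's own keys apply on top of the base. A minimal sketch of that composition, assuming Hydra >= 1.1 and a `config_path` relative to the calling script (the exact entry point Graphium uses may differ):

from hydra import initialize, compose

# Sketch only: compose config_large_gcn.yaml on top of base_config/large.yaml.
with initialize(version_base=None, config_path="expts/neurips2023_configs"):
    # `defaults: [base_config: large, _self_]` loads base_config/large.yaml
    # first, then merges this file's own keys over it.
    cfg = compose(config_name="config_large_gcn")

print(cfg.architecture.gnn.layer_type)  # 'pyg:gcn', set by the override file
print(cfg.architecture.gnn.out_dim)     # 768, inherited from base_config/large.yaml
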
diff --git a/expts/neurips2023_configs/config_large_gcn.yaml b/expts/neurips2023_configs/config_large_gcn.yaml
index 033b8a5f5..1dc397998 100644
--- a/expts/neurips2023_configs/config_large_gcn.yaml
+++ b/expts/neurips2023_configs/config_large_gcn.yaml
@@ -1,424 +1,12 @@
 # Running the gcn model with the largemix dataset on IPU.
-constants:
-  name: &name neurips2023_large_data_gcn
-  seed: &seed 42
-  raise_train_error: true # Whether the code should raise an error if it crashes during training
-  entity: multitask-gnn
-
-accelerator:
-  type: ipu # cpu or ipu or gpu
-  config_override:
-    datamodule:
-      args:
-        ipu_dataloader_training_opts:
-          mode: async
-          max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54
-          max_num_edges_per_graph: 100
-        ipu_dataloader_inference_opts:
-          mode: async
-          max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118
-          max_num_edges_per_graph: 100
-        # Data handling-related
-        batch_size_training: 30
-        batch_size_inference: 30
-    predictor:
-      metrics_every_n_train_steps: 1000
-      optim_kwargs:
-        loss_scaling: 1024
-    trainer:
-      trainer:
-        precision: 16-true
-        accumulate_grad_batches: 2
-
-  ipu_config:
-    - deviceIterations(30) # IPU would require large batches to be ready for the model.
-    - replicationFactor(16)
-    # - enableProfiling("graph_analyser") # The folder where the profile will be stored
-    # - enableExecutableCaching("pop_compiler_cache")
-    - TensorLocations.numIOTiles(128)
-    - _Popart.set("defaultBufferingDepth", 96)
-    - Precision.enableStochasticRounding(True)
-    # - Precision.enableFloatingPointExceptions(True)
-
-  ipu_inference_config:
-    # set device iteration and replication factor to 1 during inference
-    # gradient accumulation was set to 1 in the code
-    - deviceIterations(1)
-    - replicationFactor(1)
-    - Precision.enableStochasticRounding(False)
-
-# accelerator:
-#   type: cpu # cpu or ipu or gpu
-#   config_override:
-#     datamodule:
-#       args:
-#         batch_size_training: 64
-#         batch_size_inference: 256
-#     trainer:
-#       trainer:
-#         precision: 32
-#         accumulate_grad_batches: 1
-
-datamodule:
-  module_type: "MultitaskFromSmilesDataModule"
-  # module_type: "FakeDataModule" # Option to use generated data
-  args: # Matches that in the test_multitask_datamodule.py case.
-    task_specific_args: # To be replaced by a new class "DatasetParams"
-      l1000_vcap:
-        df: null
-        df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
-        # or set path as the URL directly
-        smiles_col: "SMILES"
-        label_cols: geneID-* # geneID-* means all columns starting with "geneID-"
-        # sample_size: 2000 # use sample_size for test
-        task_level: graph
-        splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt`
-        epoch_sampling_fraction: 1.0
-      l1000_mcf7:
-        df: null
-        df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
-        # or set path as the URL directly
-        smiles_col: "SMILES"
-        label_cols: geneID-* # geneID-* means all columns starting with "geneID-"
-        # sample_size: 2000 # use sample_size for test
-        task_level: graph
-        splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt`
-        epoch_sampling_fraction: 1.0
-
-      pcba_1328:
-        df: null
-        df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet
-        # or set path as the URL directly
-        smiles_col: "SMILES"
-        label_cols: assayID-* # assayID-* means all columns starting with "assayID-"
-        # sample_size: 2000 # use sample_size for test
-        task_level: graph
-        splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt`
-        epoch_sampling_fraction: 1.0
-
-      pcqm4m_g25:
-        df: null
-        df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
-        # or set path as the URL directly
-        smiles_col: "ordered_smiles"
-        label_cols: graph_* # graph_* means all columns starting with "graph_"
-        # sample_size: 2000 # use sample_size for test
-        task_level: graph
-        splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
-        label_normalization:
-          normalize_val_test: True
-          method: "normal"
-        epoch_sampling_fraction: 1.0
-
-      pcqm4m_n4:
-        df: null
-        df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
-        # or set path as the URL directly
-        smiles_col: "ordered_smiles"
-        label_cols: node_* # node_* means all columns starting with "node_"
-        # sample_size: 2000 # use sample_size for test
-        task_level: node
-        splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
-        seed: *seed
-        label_normalization:
-          normalize_val_test: True
-          method: "normal"
-        epoch_sampling_fraction: 1.0
-
-    # Featurization
-    prepare_dict_or_graph: pyg:graph
-    featurization_n_jobs: 30
-    featurization_progress: True
-    featurization_backend: "loky"
-    processed_graph_data_path: "../datacache/neurips2023-large/"
-    featurization:
-      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
-      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
-      # 'num_chiral_centers (not included yet)']
-      atom_property_list_onehot: [atomic-number, group, period, total-valence]
-      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
-      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
-      edge_property_list: [bond-type-onehot, stereo, in-ring]
-      add_self_loop: False
-      explicit_H: False # if H is included
-      use_bonds_weights: False
-      pos_encoding_as_features: # encoder dropout 0.18
-        pos_types:
-          lap_eigvec:
-            pos_level: node
-            pos_type: laplacian_eigvec
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          lap_eigval:
-            pos_level: node
-            pos_type: laplacian_eigval
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          rw_pos: # use same name as pe_encoder
-            pos_level: node
-            pos_type: rw_return_probs
-            ksteps: 16
-
-    # cache_data_path: .
-    num_workers: 32 # -1 to use all
-    persistent_workers: True # if use persistent worker at the start of each epoch.
-    # Using persistent_workers false might make the start of each epoch very long.
+defaults:
+  - base_config: large
+  - _self_
+
+constants:
+  name: neurips2023_large_data_gcn
 
 architecture:
-  model_type: FullGraphMultiTaskNetwork
-  mup_base_path: null
-  pre_nn: # Set as null to avoid a pre-nn network
-    out_dim: 64
-    hidden_dims: 256
-    depth: 2
-    activation: relu
-    last_activation: none
-    dropout: &dropout 0.1
-    normalization: &normalization layer_norm
-    last_normalization: *normalization
-    residual_type: none
-
-  pre_nn_edges: null
-
-  pe_encoders:
-    out_dim: 32
-    pool: "sum" #"mean" "max"
-    last_norm: None #"batch_norm", "layer_norm"
-    encoders: #la_pos | rw_pos
-      la_pos: # Set as null to avoid a pre-nn network
-        encoder_type: "laplacian_pe"
-        input_keys: ["laplacian_eigvec", "laplacian_eigval"]
-        output_keys: ["feat"]
-        hidden_dim: 64
-        out_dim: 32
-        model_type: 'DeepSet' #'Transformer' or 'DeepSet'
-        num_layers: 2
-        num_layers_post: 1 # Num. layers to apply after pooling
-        dropout: 0.1
-        first_normalization: "none" #"batch_norm" or "layer_norm"
-      rw_pos:
-        encoder_type: "mlp"
-        input_keys: ["rw_return_probs"]
-        output_keys: ["feat"]
-        hidden_dim: 64
-        out_dim: 32
-        num_layers: 2
-        dropout: 0.1
-        normalization: "layer_norm" #"batch_norm" or "layer_norm"
-        first_normalization: "layer_norm" #"batch_norm" or "layer_norm"
-
-
   gnn: # Set as null to avoid a post-nn network
-    in_dim: 64 # or otherwise the correct value
-    out_dim: &gnn_dim 768
-    hidden_dims: *gnn_dim
-    depth: 4
-    activation: gelu
-    last_activation: none
-    dropout: 0.1
-    normalization: "layer_norm"
-    last_normalization: *normalization
-    residual_type: simple
-    virtual_node: 'none'
-    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-
-
-  graph_output_nn:
-    graph:
-      pooling: [sum]
-      out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-      depth: 1
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-    node:
-      pooling: [sum]
-      out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-      depth: 1
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-
-  task_heads:
-    l1000_vcap:
-      task_level: graph
-      out_dim: 4890
-      hidden_dims: 128
-      depth: 2
-      activation: none
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-    l1000_mcf7:
-      task_level: graph
-      out_dim: 4890
-      hidden_dims: 128
-      depth: 2
-      activation: none
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-    pcba_1328:
-      task_level: graph
-      out_dim: 1328
-      hidden_dims: 64
-      depth: 2
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-    pcqm4m_g25:
-      task_level: graph
-      out_dim: 25
-      hidden_dims: 32
-      depth: 2
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-    pcqm4m_n4:
-      task_level: node
-      out_dim: 4
-      hidden_dims: 32
-      depth: 2
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-
-#Task-specific
-predictor:
-  metrics_on_progress_bar:
-    l1000_vcap: []
-    l1000_mcf7: []
-    pcba_1328: []
-    pcqm4m_g25: []
-    pcqm4m_n4: []
-  metrics_on_training_set:
-    l1000_vcap: []
-    l1000_mcf7: []
-    pcba_1328: []
-    pcqm4m_g25: []
-    pcqm4m_n4: []
-  loss_fun:
-    l1000_vcap:
-      name: hybrid_ce_ipu
-      n_brackets: 5
-      alpha: 0.5
-    l1000_mcf7:
-      name: hybrid_ce_ipu
-      n_brackets: 5
-      alpha: 0.5
-    pcba_1328: bce_logits_ipu
-    pcqm4m_g25: mae_ipu
-    pcqm4m_n4: mae_ipu
-  random_seed: *seed
-  optim_kwargs:
-    lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs
-    # weight_decay: 1.e-7
-  torch_scheduler_kwargs:
-    module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 100
-    warmup_epochs: 10
-    verbose: False
-  scheduler_kwargs:
-    # monitor: &monitor qm9/mae/train
-    # mode: min
-    # frequency: 1
-  target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label
-  multitask_handling: flatten # flatten, mean-per-label
-
-# Task-specific
-metrics:
-  l1000_vcap: &classif_metrics
-    - name: auroc
-      metric: auroc_ipu
-      num_classes: 5
-      task: multiclass
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-    - name: avpr
-      metric: average_precision_ipu
-      num_classes: 5
-      task: multiclass
-      target_to_int: True
-      target_nan_mask: -1000
-      ignore_index: -1000
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-  l1000_mcf7: *classif_metrics
-  pcba_1328:
-    - name: auroc
-      metric: auroc_ipu
-      task: binary
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-    - name: avpr
-      metric: average_precision_ipu
-      task: binary
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-  pcqm4m_g25: &pcqm_metrics
-    - name: mae
-      metric: mae_ipu
-      target_nan_mask: null
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-    - name: pearsonr
-      metric: pearsonr_ipu
-      threshold_kwargs: null
-      target_nan_mask: null
-      multitask_handling: mean-per-label
-    - name: r2
-      metric: r2_score_ipu
-      threshold_kwargs: null
-      target_nan_mask: null
-      multitask_handling: mean-per-label
-  pcqm4m_n4: *pcqm_metrics
-
-trainer:
-  seed: *seed
-  logger:
-    save_dir: logs/neurips2023-large/
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-large-gcn/
-    filename: *name
-    # monitor: *monitor
-    # mode: *mode
-    # save_top_k: 1
-    save_last: True
-  trainer:
-    max_epochs: *max_epochs
-    min_epochs: 1
-    check_val_every_n_epoch: 20
+    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
\ No newline at end of file
diff --git a/expts/neurips2023_configs/config_large_gcn_gpu.yaml b/expts/neurips2023_configs/config_large_gcn_gpu.yaml
index d0fa82a94..2830530aa 100644
--- a/expts/neurips2023_configs/config_large_gcn_gpu.yaml
+++ b/expts/neurips2023_configs/config_large_gcn_gpu.yaml
@@ -1,7 +1,15 @@
 # Testing GCN on LargeMix with FP16/32 on GPU
+
+defaults:
+  - base_config: large
+  - _self_
+
 constants:
-  name: &name neurips2023_large_data_gcn_gpu
-  config_override: "expts/neurips2023_configs/config_large_gcn.yaml"
+  name: neurips2023_large_data_gcn_gpu
+
+architecture:
+  gnn: # Set as null to avoid a post-nn network
+    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
 
 accelerator:
   type: gpu
diff --git a/expts/neurips2023_configs/config_large_gin.yaml b/expts/neurips2023_configs/config_large_gin.yaml
index eb2a612d2..c7d37c58c 100644
--- a/expts/neurips2023_configs/config_large_gin.yaml
+++ b/expts/neurips2023_configs/config_large_gin.yaml
@@ -1,26 +1,11 @@
 # Running the gin model with the largemix dataset on IPU.
+defaults:
+  - base_config: large
+  - _self_
+
 constants:
-  name: &name neurips2023_large_data_gin
-  config_override: "expts/neurips2023_configs/config_large_gcn.yaml"
+  name: neurips2023_large_data_gin
 
 architecture:
   gnn: # Set as null to avoid a post-nn network
-    out_dim: &gnn_dim 704
-    layer_type: 'pyg:gin' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-    hidden_dims: *gnn_dim
-
-  graph_output_nn:
-    graph:
-      out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-    node:
-      out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-
-trainer:
-  logger:
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-large-gin/
-    filename: *name
+    layer_type: 'pyg:gin'
\ No newline at end of file
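The gine and mpnn overrides below replace file-local `&normalization`/`&dropout` anchors with OmegaConf interpolations such as `${architecture.pre_nn.dropout}`, which are resolved against the fully composed config. A small self-contained sketch of that behaviour (the values here are illustrative, not the experiment's):

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    """
architecture:
  pre_nn:
    dropout: 0.1
  pre_nn_edges:
    dropout: ${architecture.pre_nn.dropout}
"""
)

print(cfg.architecture.pre_nn_edges.dropout)  # 0.1, resolved lazily on access
cfg.architecture.pre_nn.dropout = 0.2
print(cfg.architecture.pre_nn_edges.dropout)  # 0.2, interpolations track the source
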
diff --git a/expts/neurips2023_configs/config_large_gine.yaml b/expts/neurips2023_configs/config_large_gine.yaml
index 18915d5bb..2278f8422 100644
--- a/expts/neurips2023_configs/config_large_gine.yaml
+++ b/expts/neurips2023_configs/config_large_gine.yaml
@@ -1,7 +1,11 @@
 # Running the gine model with the largemix dataset on IPU.
+
+defaults:
+  - base_config: large
+  - _self_
+
 constants:
-  name: &name neurips2023_large_data_gine
-  config_override: "expts/neurips2023_configs/config_large_gcn.yaml"
+  name: neurips2023_large_data_gine
 
 architecture:
   pre_nn_edges: # Set as null to avoid a pre-nn network
@@ -10,15 +14,15 @@ architecture:
     depth: 2
     activation: relu
     last_activation: none
-    dropout: 0.1
-    normalization: &normalization layer_norm
-    last_normalization: *normalization
+    dropout: ${architecture.pre_nn.dropout}
+    normalization: ${architecture.pre_nn.normalization}
+    last_normalization: ${architecture.pre_nn.normalization}
     residual_type: none
 
-  gnn: # Set as null to avoid a post-nn network
+  gnn:
     out_dim: &gnn_dim 704
     hidden_dims: *gnn_dim
-    layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
+    layer_type: 'pyg:gine'
 
   graph_output_nn:
@@ -26,12 +30,4 @@ architecture:
     graph:
       out_dim: *gnn_dim
       hidden_dims: *gnn_dim
     node:
       out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-
-trainer:
-  logger:
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-large-gine/
-    filename: *name
+      hidden_dims: *gnn_dim
\ No newline at end of file
diff --git a/expts/neurips2023_configs/config_large_mpnn.yaml b/expts/neurips2023_configs/config_large_mpnn.yaml
index 213c75e97..b56f8dd2d 100644
--- a/expts/neurips2023_configs/config_large_mpnn.yaml
+++ b/expts/neurips2023_configs/config_large_mpnn.yaml
@@ -1,12 +1,12 @@
 # Testing the mpnn only model with the PCQMv2 dataset on IPU.
+
+defaults:
+  - base_config: large
+
 constants:
-  name: &name neurips2023_large_data_mpnn
-  config_override: "expts/neurips2023_configs/config_large_gcn.yaml"
+  name: neurips2023_large_data_mpnn
 
 architecture:
-  model_type: FullGraphMultiTaskNetwork
-  mup_base_path: null
-
   pre_nn_edges: # Set as null to avoid a pre-nn network
     out_dim: 32
     hidden_dims: 128
@@ -14,32 +14,14 @@ architecture:
     activation: relu
     last_activation: none
     dropout: 0.18
-    normalization: layer_norm
-    last_normalization: layer_norm
+    normalization: ${architecture.pre_nn.normalization}
+    last_normalization: ${architecture.pre_nn.normalization}
     residual_type: none
 
   gnn: # Set as null to avoid a post-nn network
     out_dim: &gnn_dim 64
     hidden_dims: *gnn_dim
-    layer_type: 'pyg:mpnnplus' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
+    layer_type: 'pyg:gps' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
     layer_kwargs: # Parameters for the model itself. You could define dropout_attn: 0.1
-      node_residual: false
       mpnn_type: 'pyg:mpnnplus'
-      in_dim_edges: 32
       out_dim_edges: 32
-
-  graph_output_nn:
-    graph:
-      out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-    node:
-      out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-
-trainer:
-  logger:
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-large-mpnn/
-    filename: *name
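One reason for that switch: YAML anchors such as `&gnn_dim` are expanded by the YAML parser within a single file, so an override file cannot alias a value defined in the base config, and edits after parsing do not propagate. A sketch of the difference (values illustrative):

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    """
gnn:
  out_dim: &gnn_dim 704
  hidden_dims: *gnn_dim
"""
)

# The alias was expanded at parse time, so OmegaConf only stores plain values;
# changing out_dim afterwards does NOT propagate to hidden_dims.
cfg.gnn.out_dim = 768
print(cfg.gnn.hidden_dims)  # still 704
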
diff --git a/expts/neurips2023_configs/config_small_gated_gcn.yaml b/expts/neurips2023_configs/config_small_gated_gcn.yaml
index db3d08ba4..8e00d26f6 100644
--- a/expts/neurips2023_configs/config_small_gated_gcn.yaml
+++ b/expts/neurips2023_configs/config_small_gated_gcn.yaml
@@ -1,7 +1,11 @@
 # Testing the gated_gcn model with the PCQMv2 dataset on IPU.
+
+defaults:
+  - base_config: small
+  - _self_
+
 constants:
-  name: &name neurips2023_small_data_gated_gcn
-  config_override: "expts/neurips2023_configs/config_small_gcn.yaml"
+  name: neurips2023_small_data_gated_gcn
 
 architecture:
   pre_nn_edges: # Set as null to avoid a pre-nn network
@@ -10,19 +14,10 @@ architecture:
     depth: 2
     activation: relu
     last_activation: none
-    dropout: 0.18
-    normalization: layer_norm
-    last_normalization: layer_norm
+    dropout: ${architecture.pre_nn.dropout}
+    normalization: ${architecture.pre_nn.normalization}
+    last_normalization: ${architecture.pre_nn.normalization}
     residual_type: none
-
   gnn: # Set as null to avoid a post-nn network
     layer_type: 'pyg:gated-gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-
-trainer:
-  logger:
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-small-gated-gcn/
-    filename: *name
diff --git a/expts/neurips2023_configs/config_small_gcn.yaml b/expts/neurips2023_configs/config_small_gcn.yaml
index fffffbefd..114ce26dc 100644
--- a/expts/neurips2023_configs/config_small_gcn.yaml
+++ b/expts/neurips2023_configs/config_small_gcn.yaml
@@ -1,345 +1,12 @@
-# Testing the gcn model with the PCQMv2 dataset on IPU.
-constants:
-  name: &name neurips2023_small_data_gcn
-  seed: &seed 42
-  raise_train_error: true # Whether the code should raise an error if it crashes during training
-
-accelerator:
-  type: ipu # cpu or ipu or gpu
-  config_override:
-    datamodule:
-      args:
-        ipu_dataloader_training_opts:
-          mode: async
-          max_num_nodes_per_graph: 44 # train max nodes: 20, max_edges: 54
-          max_num_edges_per_graph: 80
-        ipu_dataloader_inference_opts:
-          mode: async
-          max_num_nodes_per_graph: 44 # valid max nodes: 51, max_edges: 118
-          max_num_edges_per_graph: 80
-        # Data handling-related
-        batch_size_training: 50
-        batch_size_inference: 50
-    predictor:
-      optim_kwargs:
-        loss_scaling: 1024
-    trainer:
-      trainer:
-        precision: 16
-        accumulate_grad_batches: 4
-
-  ipu_config:
-    - deviceIterations(5) # IPU would require large batches to be ready for the model.
-    - replicationFactor(1)
-    # - enableProfiling("graph_analyser") # The folder where the profile will be stored
-    # - enableExecutableCaching("pop_compiler_cache")
-    - TensorLocations.numIOTiles(128)
-    - _Popart.set("defaultBufferingDepth", 128)
-    - Precision.enableStochasticRounding(True)
-
-# accelerator:
-#   type: cpu # cpu or ipu or gpu
-#   config_override:
-#     datamodule:
-#       batch_size_training: 64
-#       batch_size_inference: 256
-#     trainer:
-#       trainer:
-#         precision: 32
-#         accumulate_grad_batches: 1
-
-datamodule:
-  module_type: "MultitaskFromSmilesDataModule"
-  # module_type: "FakeDataModule" # Option to use generated data
-  args: # Matches that in the test_multitask_datamodule.py case.
-    task_specific_args: # To be replaced by a new class "DatasetParams"
-      qm9:
-        df: null
-        df_path: data/neurips2023/small-dataset/qm9.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz
-        # or set path as the URL directly
-        smiles_col: "smiles"
-        label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"]
-        # sample_size: 2000 # use sample_size for test
-        splits_path: data/neurips2023/small-dataset/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt`
-        seed: *seed
-        task_level: graph
-        label_normalization:
-          normalize_val_test: True
-          method: "normal"
+# Testing the gcn model with the toymix dataset on IPU.
-      tox21:
-        df: null
-        df_path: data/neurips2023/small-dataset/Tox21-7k-12-labels.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz
-        # or set path as the URL directly
-        smiles_col: "smiles"
-        label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"]
-        # sample_size: 2000 # use sample_size for test
-        splits_path: data/neurips2023/small-dataset/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt`
-        seed: *seed
-        task_level: graph
-
-      zinc:
-        df: null
-        df_path: data/neurips2023/small-dataset/ZINC12k.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz
-        # or set path as the URL directly
-        smiles_col: "smiles"
-        label_cols: ["SA", "logp", "score"]
-        # sample_size: 2000 # use sample_size for test
-        splits_path: data/neurips2023/small-dataset/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt`
-        seed: *seed
-        task_level: graph
-        label_normalization:
-          normalize_val_test: True
-          method: "normal"
-
-    # Featurization
-    prepare_dict_or_graph: pyg:graph
-    featurization_n_jobs: 30
-    featurization_progress: True
-    featurization_backend: "loky"
-    processed_graph_data_path: "../datacache/neurips2023-small/"
-    featurization:
-      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
-      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
-      # 'num_chiral_centers (not included yet)']
-      atom_property_list_onehot: [atomic-number, group, period, total-valence]
-      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
-      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
-      edge_property_list: [bond-type-onehot, stereo, in-ring]
-      add_self_loop: False
-      explicit_H: False # if H is included
-      use_bonds_weights: False
-      pos_encoding_as_features: # encoder dropout 0.18
-        pos_types:
-          lap_eigvec:
-            pos_level: node
-            pos_type: laplacian_eigvec
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          lap_eigval:
-            pos_level: node
-            pos_type: laplacian_eigval
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          rw_pos: # use same name as pe_encoder
-            pos_level: node
-            pos_type: rw_return_probs
-            ksteps: 16
-
-    # cache_data_path: .
-    num_workers: 30 # -1 to use all
-    persistent_workers: False # if use persistent worker at the start of each epoch.
-    # Using persistent_workers false might make the start of each epoch very long.
-    featurization_backend: "loky"
+defaults:
+  - base_config: small
+  - _self_
+constants:
+  name: neurips2023_small_data_gcn
 
 architecture:
-  model_type: FullGraphMultiTaskNetwork
-  mup_base_path: null
-  pre_nn: # Set as null to avoid a pre-nn network
-    out_dim: 64
-    hidden_dims: 256
-    depth: 2
-    activation: relu
-    last_activation: none
-    dropout: &dropout 0.18
-    normalization: &normalization layer_norm
-    last_normalization: *normalization
-    residual_type: none
-
-  pre_nn_edges: null # Set as null to avoid a pre-nn network
-
-  pe_encoders:
-    out_dim: 32
-    pool: "sum" #"mean" "max"
-    last_norm: None #"batch_norm", "layer_norm"
-    encoders: #la_pos | rw_pos
-      la_pos: # Set as null to avoid a pre-nn network
-        encoder_type: "laplacian_pe"
-        input_keys: ["laplacian_eigvec", "laplacian_eigval"]
-        output_keys: ["feat"]
-        hidden_dim: 64
-        out_dim: 32
-        model_type: 'DeepSet' #'Transformer' or 'DeepSet'
-        num_layers: 2
-        num_layers_post: 1 # Num. layers to apply after pooling
-        dropout: 0.1
-        first_normalization: "none" #"batch_norm" or "layer_norm"
-      rw_pos:
-        encoder_type: "mlp"
-        input_keys: ["rw_return_probs"]
-        output_keys: ["feat"]
-        hidden_dim: 64
-        out_dim: 32
-        num_layers: 2
-        dropout: 0.1
-        normalization: "layer_norm" #"batch_norm" or "layer_norm"
-        first_normalization: "layer_norm" #"batch_norm" or "layer_norm"
-
-
   gnn: # Set as null to avoid a post-nn network
-    in_dim: 64 # or otherwise the correct value
-    out_dim: &gnn_dim 96
-    hidden_dims: *gnn_dim
-    depth: 4
-    activation: gelu
-    last_activation: none
-    dropout: 0.1
-    normalization: "layer_norm"
-    last_normalization: *normalization
-    residual_type: simple
-    virtual_node: 'none'
     layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-    layer_kwargs: null # Parameters for the model itself. You could define dropout_attn: 0.1
-
-
-  graph_output_nn:
-    graph:
-      pooling: [sum]
-      out_dim: *gnn_dim
-      hidden_dims: *gnn_dim
-      depth: 1
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-
-  task_heads:
-    qm9:
-      task_level: graph
-      out_dim: 19
-      hidden_dims: 128
-      depth: 2
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-    tox21:
-      task_level: graph
-      out_dim: 12
-      hidden_dims: 64
-      depth: 2
-      activation: relu
-      last_activation: sigmoid
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-    zinc:
-      task_level: graph
-      out_dim: 3
-      hidden_dims: 32
-      depth: 2
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-
-#Task-specific
-predictor:
-  metrics_on_progress_bar:
-    qm9: ["mae"]
-    tox21: ["auroc"]
-    zinc: ["mae"]
-  loss_fun:
-    qm9: mae_ipu
-    tox21: bce_ipu
-    zinc: mae_ipu
-  random_seed: *seed
-  optim_kwargs:
-    lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs
-    # weight_decay: 1.e-7
-  torch_scheduler_kwargs:
-    module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 100
-    warmup_epochs: 10
-    verbose: False
-  scheduler_kwargs:
-    # monitor: &monitor qm9/mae/train
-    # mode: min
-    # frequency: 1
-  target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label
-  multitask_handling: flatten # flatten, mean-per-label
-
-# Task-specific
-metrics:
-  qm9: &qm9_metrics
-    - name: mae
-      metric: mae_ipu
-      target_nan_mask: null
-      multitask_handling: flatten
-      threshold_kwargs: null
-    - name: pearsonr
-      metric: pearsonr_ipu
-      threshold_kwargs: null
-      target_nan_mask: null
-      multitask_handling: mean-per-label
-    - name: r2_score
-      metric: r2_score_ipu
-      target_nan_mask: null
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-  tox21:
-    - name: auroc
-      metric: auroc_ipu
-      task: binary
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-    - name: avpr
-      metric: average_precision_ipu
-      task: binary
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-    - name: f1 > 0.5
-      metric: f1
-      multitask_handling: mean-per-label
-      target_to_int: True
-      num_classes: 2
-      average: micro
-      threshold_kwargs: &threshold_05
-        operator: greater
-        threshold: 0.5
-        th_on_preds: True
-        th_on_target: True
-    - name: precision > 0.5
-      metric: precision
-      multitask_handling: mean-per-label
-      average: micro
-      threshold_kwargs: *threshold_05
-  zinc: *qm9_metrics
-
-trainer:
-  seed: *seed
-  logger:
-    save_dir: logs/neurips2023-small/
-    name: *name
-    project: *name
-  #early_stopping:
-  #  monitor: *monitor
-  #  min_delta: 0
-  #  patience: 10
-  #  mode: &mode min
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-small-gcn/
-    filename: *name
-    # monitor: *monitor
-    # mode: *mode
-    # save_top_k: 1
-    save_last: True
-  trainer:
-    max_epochs: *max_epochs
-    min_epochs: 1
-    check_val_every_n_epoch: 20
diff --git a/expts/neurips2023_configs/config_small_gcn_gpu.yaml b/expts/neurips2023_configs/config_small_gcn_gpu.yaml
index bf2a03a3b..8b5a46e26 100644
--- a/expts/neurips2023_configs/config_small_gcn_gpu.yaml
+++ b/expts/neurips2023_configs/config_small_gcn_gpu.yaml
@@ -1,7 +1,15 @@
 # Testing GCN on ToyMix with FP16/32 on GPU
+
+defaults:
+  - base_config: small
+  - _self_
+
 constants:
-  name: &name neurips2023_small_data_gcn_gpu
-  config_override: "expts/neurips2023_configs/config_small_gcn.yaml"
+  name: neurips2023_small_data_gcn_gpu
+
+architecture:
+  gnn: # Set as null to avoid a post-nn network
+    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
 
 accelerator:
   type: gpu # cpu or ipu or gpu
diff --git a/expts/neurips2023_configs/config_small_gin.yaml b/expts/neurips2023_configs/config_small_gin.yaml
index a22a4e6a6..e018f722a 100644
--- a/expts/neurips2023_configs/config_small_gin.yaml
+++ b/expts/neurips2023_configs/config_small_gin.yaml
@@ -1,16 +1,12 @@
 # Testing the gin model with the PCQMv2 dataset on IPU.
+
+defaults:
+  - base_config: small
+  - _self_
+
 constants:
-  name: &name neurips2023_small_data_gin
-  config_override: "expts/neurips2023_configs/config_small_gcn.yaml"
+  name: neurips2023_small_data_gin
 
 architecture:
   gnn: # Set as null to avoid a post-nn network
-    layer_type: 'pyg:gin' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-
-trainer:
-  logger:
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-small-gin/
-    filename: *name
+    layer_type: 'pyg:gin'
diff --git a/expts/neurips2023_configs/config_small_gine.yaml b/expts/neurips2023_configs/config_small_gine.yaml
index d25f89a72..111bebbc2 100644
--- a/expts/neurips2023_configs/config_small_gine.yaml
+++ b/expts/neurips2023_configs/config_small_gine.yaml
@@ -1,7 +1,11 @@
 # Testing the gine model with the PCQMv2 dataset on IPU.
+
+defaults:
+  - base_config: small
+  - _self_
+
 constants:
-  name: &name neurips2023_small_data_gine
-  config_override: "expts/neurips2023_configs/config_small_gcn.yaml"
+  name: neurips2023_small_data_gine
 
 architecture:
   pre_nn_edges: # Set as null to avoid a pre-nn network
@@ -10,18 +14,10 @@ architecture:
     depth: 2
     activation: relu
     last_activation: none
-    dropout: 0.1
-    normalization: &normalization layer_norm
-    last_normalization: *normalization
+    dropout: ${architecture.pre_nn.dropout}
+    normalization: ${architecture.pre_nn.normalization}
+    last_normalization: ${architecture.pre_nn.normalization}
    residual_type: none
 
   gnn: # Set as null to avoid a post-nn network
-    layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-
-trainer:
-  logger:
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-small-gine/
-    filename: *name
+    layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
\ No newline at end of file
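In the mpnn override below, only a few `gnn` keys are redefined; depth, activation, residual_type and the rest now come from base_config/small.yaml, because OmegaConf merges mappings key by key rather than replacing them wholesale. A minimal sketch of that merge semantics (keys abbreviated, values illustrative):

from omegaconf import OmegaConf

base = OmegaConf.create(
    {"gnn": {"out_dim": 96, "depth": 4, "layer_type": "pyg:gcn", "layer_kwargs": None}}
)
override = OmegaConf.create(
    {"gnn": {"out_dim": 64, "layer_type": "pyg:gps",
             "layer_kwargs": {"mpnn_type": "pyg:mpnnplus", "out_dim_edges": 32}}}
)

merged = OmegaConf.merge(base, override)
print(merged.gnn.depth)       # 4, kept from the base
print(merged.gnn.layer_type)  # 'pyg:gps', replaced by the override
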
diff --git a/expts/neurips2023_configs/config_small_mpnn.yaml b/expts/neurips2023_configs/config_small_mpnn.yaml
index d7c862359..357a8f560 100644
--- a/expts/neurips2023_configs/config_small_mpnn.yaml
+++ b/expts/neurips2023_configs/config_small_mpnn.yaml
@@ -1,7 +1,10 @@
 # Testing the mpnn only model with the PCQMv2 dataset on IPU.
+
+defaults:
+  - base_config: small
+
 constants:
-  name: &name neurips2023_small_data_mpnn
-  config_override: "expts/neurips2023_configs/config_small_gcn.yaml"
+  name: neurips2023_small_data_mpnn
 
 architecture:
   pre_nn_edges: # Set as null to avoid a pre-nn network
@@ -10,33 +13,15 @@ architecture:
     depth: 2
     activation: relu
     last_activation: none
-    dropout: 0.18
-    normalization: layer_norm
-    last_normalization: layer_norm
+    dropout: ${architecture.pre_nn.dropout}
+    normalization: ${architecture.pre_nn.normalization}
+    last_normalization: ${architecture.pre_nn.normalization}
     residual_type: none
 
   gnn: # Set as null to avoid a post-nn network
     out_dim: &gnn_dim 64
     hidden_dims: *gnn_dim
-    depth: 4
-    activation: gelu
-    last_activation: none
-    dropout: 0.1
-    normalization: "layer_norm"
-    last_normalization: *normalization
-    residual_type: simple
-    virtual_node: 'none'
-    layer_type: 'pyg:mpnnplus' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
+    layer_type: 'pyg:gps' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
     layer_kwargs: # Parameters for the model itself. You could define dropout_attn: 0.1
-      in_dim: *gnn_dim
-      out_dim: *gnn_dim
-      in_dim_edges: 32
+      mpnn_type: 'pyg:mpnnplus'
       out_dim_edges: 32
-
-trainer:
-  logger:
-    name: *name
-    project: *name
-  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-small-mpnn/
-    filename: *name
diff --git a/expts/neurips2023_configs/debug/config_debug.yaml b/expts/neurips2023_configs/debug/config_debug.yaml
index 5e078d7c4..a323427e5 100644
--- a/expts/neurips2023_configs/debug/config_debug.yaml
+++ b/expts/neurips2023_configs/debug/config_debug.yaml
@@ -3,6 +3,7 @@ constants:
   name: &name neurips2023_small_data_mpnn
   seed: &seed 999
   raise_train_error: true # Whether the code should raise an error if it crashes during training
+  entity: multitask-gnn
 
 # accelerator:
 #   type: ipu # cpu or ipu or gpu
@@ -108,7 +109,6 @@ datamodule:
     num_workers: 0 # -1 to use all
     persistent_workers: False # if use persistent worker at the start of each epoch.
     # Using persistent_workers false might make the start of each epoch very long.
-    featurization_backend: "loky"
 
 
 architecture:
diff --git a/expts/neurips2023_configs/debug/config_small_gcn_debug.yaml b/expts/neurips2023_configs/debug/config_small_gcn_debug.yaml
index 1b4171a43..717ae0675 100644
--- a/expts/neurips2023_configs/debug/config_small_gcn_debug.yaml
+++ b/expts/neurips2023_configs/debug/config_small_gcn_debug.yaml
@@ -3,6 +3,7 @@ constants:
   name: &name neurips2023_small_data_gcn
   seed: &seed 42
   raise_train_error: true # Whether the code should raise an error if it crashes during training
+  entity: multitask-gnn
 
 accelerator:
   type: ipu # cpu or ipu or gpu
@@ -122,7 +123,6 @@ datamodule:
     num_workers: 30 # -1 to use all
     persistent_workers: False # if use persistent worker at the start of each epoch.
     # Using persistent_workers false might make the start of each epoch very long.
-    featurization_backend: "loky"
 
 
 architecture:
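Since the base configs reference `${constants.name}` in their logger and checkpoint paths, every override file must define `constants.name`; composing a config and dumping it with interpolations forced is a quick way to catch a missing key before launching a run. A sketch, again assuming the Hydra compose API and illustrative paths:

from hydra import initialize, compose
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="expts/neurips2023_configs"):
    cfg = compose(config_name="config_small_gcn_gpu")

# resolve=True forces every ${...} interpolation; a missing constants.name
# would raise an error here instead of mid-run.
print(OmegaConf.to_yaml(cfg, resolve=True))
assert cfg.trainer.logger.name == cfg.constants.name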